library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.1
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
insurance_training <- read.csv("insurance_training_data.csv")
insurance_evaluation <- read.csv("insurance-evaluation-data.csv")
# Quick exploration
str(insurance_training) # Structure of data
## 'data.frame': 8161 obs. of 26 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET_FLAG: int 0 0 0 0 0 1 0 1 1 0 ...
## $ TARGET_AMT : num 0 0 0 0 0 ...
## $ KIDSDRIV : int 0 0 0 0 0 0 0 1 0 0 ...
## $ AGE : int 60 43 35 51 50 34 54 37 34 50 ...
## $ HOMEKIDS : int 0 0 1 0 0 1 0 2 0 0 ...
## $ YOJ : int 11 11 10 14 NA 12 NA NA 10 7 ...
## $ INCOME : chr "$67,349" "$91,449" "$16,039" "" ...
## $ PARENT1 : chr "No" "No" "No" "No" ...
## $ HOME_VAL : chr "$0" "$257,252" "$124,191" "$306,251" ...
## $ MSTATUS : chr "z_No" "z_No" "Yes" "Yes" ...
## $ SEX : chr "M" "M" "z_F" "M" ...
## $ EDUCATION : chr "PhD" "z_High School" "z_High School" "<High School" ...
## $ JOB : chr "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
## $ TRAVTIME : int 14 22 5 32 36 46 33 44 34 48 ...
## $ CAR_USE : chr "Private" "Commercial" "Private" "Private" ...
## $ BLUEBOOK : chr "$14,230" "$14,940" "$4,010" "$15,440" ...
## $ TIF : int 11 1 4 7 1 1 1 1 1 7 ...
## $ CAR_TYPE : chr "Minivan" "Minivan" "z_SUV" "Minivan" ...
## $ RED_CAR : chr "yes" "yes" "no" "yes" ...
## $ OLDCLAIM : chr "$4,461" "$0" "$38,690" "$0" ...
## $ CLM_FREQ : int 2 0 2 0 2 0 0 1 0 0 ...
## $ REVOKED : chr "No" "No" "No" "No" ...
## $ MVR_PTS : int 3 0 3 0 3 0 0 10 0 1 ...
## $ CAR_AGE : int 18 1 10 6 17 7 1 7 1 17 ...
## $ URBANICITY : chr "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
summary(insurance_training) # Summary statistics
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV
## Min. : 1 Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.: 2559 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000
## Median : 5133 Median :0.0000 Median : 0 Median :0.0000
## Mean : 5152 Mean :0.2638 Mean : 1504 Mean :0.1711
## 3rd Qu.: 7745 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000
## Max. :10302 Max. :1.0000 Max. :107586 Max. :4.0000
##
## AGE HOMEKIDS YOJ INCOME
## Min. :16.00 Min. :0.0000 Min. : 0.0 Length:8161
## 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0 Class :character
## Median :45.00 Median :0.0000 Median :11.0 Mode :character
## Mean :44.79 Mean :0.7212 Mean :10.5
## 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0
## Max. :81.00 Max. :5.0000 Max. :23.0
## NA's :6 NA's :454
## PARENT1 HOME_VAL MSTATUS SEX
## Length:8161 Length:8161 Length:8161 Length:8161
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## EDUCATION JOB TRAVTIME CAR_USE
## Length:8161 Length:8161 Min. : 5.00 Length:8161
## Class :character Class :character 1st Qu.: 22.00 Class :character
## Mode :character Mode :character Median : 33.00 Mode :character
## Mean : 33.49
## 3rd Qu.: 44.00
## Max. :142.00
##
## BLUEBOOK TIF CAR_TYPE RED_CAR
## Length:8161 Min. : 1.000 Length:8161 Length:8161
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 4.000 Mode :character Mode :character
## Mean : 5.351
## 3rd Qu.: 7.000
## Max. :25.000
##
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## Length:8161 Min. :0.0000 Length:8161 Min. : 0.000
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.: 0.000
## Mode :character Median :0.0000 Mode :character Median : 1.000
## Mean :0.7986 Mean : 1.696
## 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :5.0000 Max. :13.000
##
## CAR_AGE URBANICITY
## Min. :-3.000 Length:8161
## 1st Qu.: 1.000 Class :character
## Median : 8.000 Mode :character
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
# Check for missing values
colSums(is.na(insurance_training))
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 6 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 454 0 0 0 0 0
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 0 0 0 0 0 0
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 0 0 0 0 0 0
## CAR_AGE URBANICITY
## 510 0
The data has 8,161 observations and 25 variables (excluding INDEX, which is not used in the analysis). The primary target variable is TARGET_FLAG, a binary indicator of whether a car was in a crash; the secondary target, TARGET_AMT, is the cost incurred if the car was in a crash.
AGE has a mean of 44.8 years and a median of 45, indicating a balanced age distribution. TRAVTIME (commute time to work) averages 33.5 minutes, with the middle half of values between 22 and 44 minutes. A full table of key statistics is included above for reference.
Several variables have missing values: AGE (6 missing values), YOJ (454), INCOME (many blank strings), and CAR_AGE (510). We will apply imputation strategies to address these gaps: missing AGE, YOJ, and CAR_AGE values can be replaced with their medians (45, 11, and 8 years, respectively), and INCOME, recorded as character strings, will be cleaned and converted to numeric, with missing values replaced by the median. (In the data preparation step further below, the simpler mean imputation is what is ultimately applied to all numeric columns.)
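The INCOME cleaning step is not shown later in this report, so here is a minimal sketch; clean_currency and INCOME_NUM are our names, assuming values like "$67,349" with blanks for missing entries:
# Strip "$" and "," and convert to numeric; blank strings become NA
clean_currency <- function(x) as.numeric(gsub("[$,]", "", x))
insurance_training$INCOME_NUM <- clean_currency(insurance_training$INCOME)
# Fill the remaining NAs with the median income
insurance_training$INCOME_NUM[is.na(insurance_training$INCOME_NUM)] <-
  median(insurance_training$INCOME_NUM, na.rm = TRUE)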
# Histogram for AGE
ggplot(insurance_training, aes(x = AGE)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Age Distribution", x = "Age", y = "Frequency")
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Boxplot for CAR_AGE
ggplot(insurance_training, aes(y = CAR_AGE)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Boxplot of CAR_AGE", y = "Car Age")
## Warning: Removed 510 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Drop rows with missing values (kept for reference; the correlation below
# instead uses pairwise complete observations, so no rows are discarded)
numeric_data <- na.omit(insurance_training[, sapply(insurance_training, is.numeric)])
# Checking correlation between numeric features
numeric_vars <- sapply(insurance_training, is.numeric)
correlation_matrix <- cor(insurance_training[, numeric_vars], use = "pairwise.complete.obs")
print(correlation_matrix)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE
## INDEX 1.0000000000 -0.001669645 -0.0005934765 0.015575660 3.384609e-02
## TARGET_FLAG -0.0016696445 1.000000000 0.5342460609 0.103668296 -1.032167e-01
## TARGET_AMT -0.0005934765 0.534246061 1.0000000000 0.055394177 -4.172832e-02
## KIDSDRIV 0.0155756605 0.103668296 0.0553941768 1.000000000 -7.517883e-02
## AGE 0.0338460913 -0.103216708 -0.0417283235 -0.075178828 1.000000e+00
## HOMEKIDS 0.0000521436 0.115621011 0.0619880434 0.464015239 -4.454410e-01
## YOJ 0.0267332475 -0.070511825 -0.0220851981 0.043304800 1.360725e-01
## TRAVTIME -0.0230701278 0.048368310 0.0279870160 0.008447299 5.269488e-03
## TIF -0.0092139808 -0.082370050 -0.0464808306 -0.001988715 -6.631285e-05
## CLM_FREQ 0.0187802762 0.216196061 0.1164191586 0.037062929 -2.409232e-02
## MVR_PTS 0.0078825302 0.219197054 0.1378655086 0.053566373 -7.157543e-02
## CAR_AGE -0.0006986235 -0.100650615 -0.0588221106 -0.053993001 1.762208e-01
## HOMEKIDS YOJ TRAVTIME TIF CLM_FREQ
## INDEX 0.0000521436 0.02673325 -0.023070128 -9.213981e-03 0.018780276
## TARGET_FLAG 0.1156210106 -0.07051183 0.048368310 -8.237005e-02 0.216196061
## TARGET_AMT 0.0619880434 -0.02208520 0.027987016 -4.648083e-02 0.116419159
## KIDSDRIV 0.4640152389 0.04330480 0.008447299 -1.988715e-03 0.037062929
## AGE -0.4454410402 0.13607248 0.005269488 -6.631285e-05 -0.024092321
## HOMEKIDS 1.0000000000 0.08682902 -0.007245604 1.181332e-02 0.029349289
## YOJ 0.0868290246 1.00000000 -0.016945311 2.478659e-02 -0.026308028
## TRAVTIME -0.0072456039 -0.01694531 1.000000000 -1.160463e-02 0.006560211
## TIF 0.0118133187 0.02478659 -0.011604626 1.000000e+00 -0.023022955
## CLM_FREQ 0.0293492894 -0.02630803 0.006560211 -2.302295e-02 1.000000000
## MVR_PTS 0.0606013438 -0.03785508 0.010598511 -4.104573e-02 0.396638373
## CAR_AGE -0.1521463981 0.06140648 -0.038232806 7.767352e-03 -0.009318765
## MVR_PTS CAR_AGE
## INDEX 0.00788253 -0.0006986235
## TARGET_FLAG 0.21919705 -0.1006506150
## TARGET_AMT 0.13786551 -0.0588221106
## KIDSDRIV 0.05356637 -0.0539930013
## AGE -0.07157543 0.1762208228
## HOMEKIDS 0.06060134 -0.1521463981
## YOJ -0.03785508 0.0614064819
## TRAVTIME 0.01059851 -0.0382328060
## TIF -0.04104573 0.0077673523
## CLM_FREQ 0.39663837 -0.0093187652
## MVR_PTS 1.00000000 -0.0199040729
## CAR_AGE -0.01990407 1.0000000000
The histogram for AGE shows an approximately normal distribution, with the highest frequency around the median age of 45.
For CAR_AGE, outliers are evident: the minimum value is -3, which is impossible since a car cannot have a negative age.
A correlation matrix highlights several key relationships:
TARGET_FLAG has a moderate positive correlation with MVR_PTS (motor vehicle record points, 0.22) and with CLM_FREQ (claim frequency, 0.22). These variables are strong candidates for inclusion in the logistic regression model.
AGE and CAR_AGE exhibit weak negative correlations with TARGET_FLAG (-0.10 each), suggesting limited predictive value.
TARGET_AMT is moderately correlated with TARGET_FLAG (0.53), as expected, since claim costs are only observed when a crash occurred.
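As a quick complement to scanning the full matrix, the predictors can be ranked by their absolute correlation with TARGET_FLAG — a minimal sketch using the correlation_matrix computed above:
# Rank numeric predictors by |correlation| with TARGET_FLAG,
# excluding the identifier and the two target columns themselves
flag_cor <- correlation_matrix["TARGET_FLAG", ]
sort(abs(flag_cor[!names(flag_cor) %in% c("TARGET_FLAG", "TARGET_AMT", "INDEX")]),
     decreasing = TRUE)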
# Bar plot for CAR_TYPE
ggplot(insurance_training, aes(x = CAR_TYPE)) +
geom_bar(fill = "purple") +
labs(title = "Car Type Distribution", x = "Car Type", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar plot for TARGET_FLAG
ggplot(insurance_training, aes(x = factor(TARGET_FLAG))) +
geom_bar(fill = "orange") +
labs(title = "Target Flag Distribution", x = "Target Flag (0 = No Crash, 1 = Crash)", y = "Count")
Vehicle Types: The dataset includes a variety of vehicle categories, the most common being SUVs (z_SUV) with roughly 2,260 entries, followed by:
Minivans: ~2,020 entries
Pickup trucks: ~1,375 entries
Sports cars: ~875 entries
Vans: ~750 entries
Panel trucks: ~700 entries
This distribution highlights a predominance of family-oriented and utility vehicles, which may influence claim tendencies.
Target Flag Distribution: The target variable TARGET_FLAG has an imbalanced distribution:
No Crash (0): about 6,008 instances (73.6% of the data).
Crash (1): about 2,153 instances (26.4%).
This imbalance will be considered in model development, potentially requiring techniques like class weighting, oversampling, or undersampling (sketched below) to ensure accurate prediction.
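One option is undersampling with caret's downSample(); a minimal sketch (the balanced_train name is ours, and undersampling discards data, so it is only one of several choices):
# Balance the classes by randomly dropping majority-class rows
set.seed(123)
balanced_train <- downSample(x = insurance_training[, names(insurance_training) != "TARGET_FLAG"],
                             y = factor(insurance_training$TARGET_FLAG),
                             yname = "TARGET_FLAG")
table(balanced_train$TARGET_FLAG)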
# identify missing values
colSums(is.na(insurance_training))
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 6 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS SEX
## 454 0 0 0 0 0
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF
## 0 0 0 0 0 0
## CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## 0 0 0 0 0 0
## CAR_AGE URBANICITY
## 510 0
Since the variables with missing values are all numeric, we impute them here with the column mean.
# Impute missing values for numerical variables with mean
numeric_vars <- sapply(insurance_training, is.numeric)
insurance_training[numeric_vars] <- lapply(insurance_training[numeric_vars], function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))
# Loop through all variables to create flags for missing values
for (var in colnames(insurance_training)) {
insurance_training[paste0(var, "_FLAG")] <- ifelse(is.na(insurance_training[[var]]), 1, 0)
}
# Check the new flags columns
head(insurance_training)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1
## 1 1 0 0 0 60 0 11.00000 $67,349 No
## 2 2 0 0 0 43 0 11.00000 $91,449 No
## 3 4 0 0 0 35 1 10.00000 $16,039 No
## 4 5 0 0 0 51 0 14.00000 No
## 5 6 0 0 0 50 0 10.49929 $114,986 No
## 6 7 1 2946 0 34 1 12.00000 $125,301 Yes
## HOME_VAL MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK
## 1 $0 z_No M PhD Professional 14 Private $14,230
## 2 $257,252 z_No M z_High School z_Blue Collar 22 Commercial $14,940
## 3 $124,191 Yes z_F z_High School Clerical 5 Private $4,010
## 4 $306,251 Yes M <High School z_Blue Collar 32 Private $15,440
## 5 $243,925 Yes z_F PhD Doctor 36 Private $18,000
## 6 $0 z_No z_F Bachelors z_Blue Collar 46 Commercial $17,430
## TIF CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1 11 Minivan yes $4,461 2 No 3 18
## 2 1 Minivan yes $0 0 No 0 1
## 3 4 z_SUV no $38,690 2 No 3 10
## 4 7 Minivan yes $0 0 No 0 6
## 5 1 z_SUV no $19,217 2 Yes 3 17
## 6 1 Sports Car no $0 0 No 0 7
## URBANICITY INDEX_FLAG TARGET_FLAG_FLAG TARGET_AMT_FLAG KIDSDRIV_FLAG
## 1 Highly Urban/ Urban 0 0 0 0
## 2 Highly Urban/ Urban 0 0 0 0
## 3 Highly Urban/ Urban 0 0 0 0
## 4 Highly Urban/ Urban 0 0 0 0
## 5 Highly Urban/ Urban 0 0 0 0
## 6 Highly Urban/ Urban 0 0 0 0
## AGE_FLAG HOMEKIDS_FLAG YOJ_FLAG INCOME_FLAG PARENT1_FLAG HOME_VAL_FLAG
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## MSTATUS_FLAG SEX_FLAG EDUCATION_FLAG JOB_FLAG TRAVTIME_FLAG CAR_USE_FLAG
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## BLUEBOOK_FLAG TIF_FLAG CAR_TYPE_FLAG RED_CAR_FLAG OLDCLAIM_FLAG CLM_FREQ_FLAG
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## REVOKED_FLAG MVR_PTS_FLAG CAR_AGE_FLAG URBANICITY_FLAG
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
The paste0(var, "_FLAG") call dynamically names the new flag column after the original variable (e.g., for AGE the flag column is AGE_FLAG).
ifelse(is.na(insurance_training[[var]]), 1, 0) assigns 1 where the value is missing (NA) and 0 otherwise. Note, however, that because this loop runs after the imputation above, every flag comes out 0; to carry real information, the flags must be computed before the missing values are filled in.
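Because of that ordering issue, a safer pattern is to record missingness before imputing — a minimal sketch that re-reads the raw file (the raw name is ours):
# Record missingness BEFORE imputation so the flags mark the rows that were filled in
raw <- read.csv("insurance_training_data.csv")
for (var in c("AGE", "YOJ", "CAR_AGE")) {
  insurance_training[paste0(var, "_FLAG")] <- as.integer(is.na(raw[[var]]))
}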
In this sub-section, we bucketize the continuous variables AGE and TARGET_AMT:
# Bucketize AGE into ranges
insurance_training$AGE_BUCKET <- cut(insurance_training$AGE,
breaks = c(18, 30, 50, 70, Inf),
labels = c("18-30", "31-50", "51-70", "70+"))
# Bucketize TARGET_AMT into categories
insurance_training$TARGET_AMT_BUCKET <- cut(insurance_training$TARGET_AMT,
breaks = c(0, 1000, 5000, 10000, Inf),
labels = c("0-1000", "1001-5000", "5001-10000", "10000+"))
# Check the bucketized variables
table(insurance_training$AGE_BUCKET)
##
## 18-30 31-50 51-70 70+
## 400 5638 2105 9
table(insurance_training$TARGET_AMT_BUCKET)
##
## 0-1000 1001-5000 5001-10000 10000+
## 102 1267 629 155
Bucketizing AGE into discrete categories makes the variable easier to interpret and analyze. Similarly, bucketizing TARGET_AMT transforms a continuous variable with high variation into manageable categories, which helps with clearer reporting and analysis of trends.
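As a quick illustration of the reporting benefit, the crash rate within each age bucket can be tabulated directly — a minimal sketch:
# Proportion of crashes (TARGET_FLAG = 1) within each age bucket
tapply(insurance_training$TARGET_FLAG, insurance_training$AGE_BUCKET, mean)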
First, to make a clear decision about the type of transformation each variable needs, we check its skewness:
# Check skewness for numeric variables
skew_values <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT", "KIDSDRIV", "HOMEKIDS")], skewness, na.rm = TRUE)
# View skewness values
print(skew_values)
## AGE CAR_AGE TARGET_AMT KIDSDRIV HOMEKIDS
## -0.02899961 0.29120232 8.70630337 3.35183743 1.34112709
Interpretations:
AGE (-0.03): close to 0, indicating that AGE is approximately normally distributed. No transformation is needed.
CAR_AGE (0.29): slightly positively skewed, but close enough to 0 that a transformation is probably unnecessary.
TARGET_AMT (8.71): highly positively skewed, with the long right tail typical of monetary data. A log transformation should help normalize this variable.
KIDSDRIV (3.35): substantially positively skewed. A log transformation is worth trying, though a count variable dominated by zeros may remain skewed regardless.
HOMEKIDS (1.34): moderately positively skewed. A transformation is not strictly necessary, but a log transformation could improve the distribution.
Based on the skewness values above, we log-transform TARGET_AMT along with the two other clearly right-skewed variables, KIDSDRIV and HOMEKIDS:
# Apply log(x + 1) transformations (the +1 guards against log(0))
insurance_training$TARGET_AMT_LOG <- log(insurance_training$TARGET_AMT + 1)
insurance_training$KIDSDRIV_LOG <- log(insurance_training$KIDSDRIV + 1)
insurance_training$HOMEKIDS_LOG <- log(insurance_training$HOMEKIDS + 1)
Let’s check the skewness values after the transformations we performed above:
# Check skewness after applying the transformations
skew_values_after_transformation <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_LOG", "KIDSDRIV_LOG", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)
# View the skewness values after transformation
print(skew_values_after_transformation)
## AGE CAR_AGE TARGET_AMT_LOG KIDSDRIV_LOG HOMEKIDS_LOG
## -0.02899961 0.29120232 1.11518775 2.73381482 0.93255965
That is good progress:
The log transformation reduced the skewness of TARGET_AMT substantially (from 8.71 to 1.12), though it remains moderately skewed, which is typical for monetary variables; it could still benefit from further adjustment.
The transformation reduced the skewness of KIDSDRIV (from 3.35 to 2.73), but the variable is still strongly right-skewed, so another transformation is worth considering.
The log transformation brought the skewness of HOMEKIDS down to 0.93, much closer to zero; this variable is now in better shape for modeling.
One additional transformation that can help normalize the continuous variable TARGET_AMT is the Box-Cox transformation:
# Shift by 1 so all values are strictly positive (Box-Cox requires x > 0)
insurance_training$TARGET_AMT_SHIFTED <- insurance_training$TARGET_AMT + 1
# Profile the Box-Cox log-likelihood and keep the lambda that maximizes it
boxcox_result <- boxcox(TARGET_AMT_SHIFTED ~ 1, data = insurance_training)
lambda <- boxcox_result$x[which.max(boxcox_result$y)]
# Apply the Box-Cox transformation with the selected lambda
insurance_training$TARGET_AMT_BOXCOX <- (insurance_training$TARGET_AMT_SHIFTED^lambda - 1) / lambda
We can also apply a square-root transformation:
insurance_training$TARGET_AMT_SQRT <- sqrt(insurance_training$TARGET_AMT)
Let’s do the same for the variable KIDSDRIV. First, Box-Cox (note this is a shortcut: it reuses the lambda profiled for TARGET_AMT instead of estimating one for KIDSDRIV, and it omits the usual division by lambda):
insurance_training$KIDSDRIV_BOXCOX <- (insurance_training$KIDSDRIV + 1)^lambda - 1
Then, a cube-root transformation:
# The sign()/abs() form would also handle negative values safely
insurance_training$KIDSDRIV_CUBE <- sign(insurance_training$KIDSDRIV) * abs(insurance_training$KIDSDRIV)^(1/3)
Let’s check the skewness once more after these transformations:
# Check skewness after applying the transformations
skew_values_after_transformation2 <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_BOXCOX", "KIDSDRIV_BOXCOX", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)
# View the skewness values after transformation
print(skew_values_after_transformation2)
## AGE CAR_AGE TARGET_AMT_BOXCOX KIDSDRIV_BOXCOX
## -0.02899961 0.29120232 1.07282678 -2.60368149
## HOMEKIDS_LOG
## 0.93255965
# Check skewness after applying the transformations
skew_values_after_transformation3 <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_SQRT", "KIDSDRIV_CUBE", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)
# View the skewness values after transformation
print(skew_values_after_transformation3)
## AGE CAR_AGE TARGET_AMT_SQRT KIDSDRIV_CUBE HOMEKIDS_LOG
## -0.02899961 0.29120232 2.34878526 2.43528069 0.93255965
Based on the transformations above:
TARGET_AMT: Box-Cox was more effective at reducing skewness than the square-root transformation (1.07 versus 2.35). For KIDSDRIV, Box-Cox overshot into strong negative skew (-2.60), while the cube root left strong positive skew (2.44); neither worked well, so we keep the _CUBE version for now or look for another approach, such as the binary indicator sketched at the end of this subsection.
Age-based grouping (AGE_GROUP): age is continuous, but grouping it into categories for analysis and modeling makes trends across age ranges easier to see. For example, it might be valuable to compare the claim behavior of individuals in their 20s with that of individuals in their 50s.
# Create age groups
insurance_training$AGE_GROUP <- cut(insurance_training$AGE,
breaks = c(18, 30, 50, Inf),
labels = c("18-30", "31-50", "51+"))
Creating ratio variables (KIDSDRIV_RATIO, HOMEKIDS_RATIO): these give a relative measure of how many kids are driving (or at home) in relation to the policyholder’s age. Younger parents may have fewer kids of driving age and older parents more, which may affect outcomes like insurance risk or claim amounts.
# Create a new variable as the ratio of KIDSDRIV to AGE
insurance_training$KIDSDRIV_RATIO <- insurance_training$KIDSDRIV / insurance_training$AGE
# Create a new variable as the ratio of HOMEKIDS to AGE
insurance_training$HOMEKIDS_RATIO <- insurance_training$HOMEKIDS / insurance_training$AGE
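As an alternative to further transforming KIDSDRIV, a simple presence/absence indicator is often easier to interpret — a minimal sketch (the ANY_KIDSDRIV name is ours):
# 1 if any kids drive the car, 0 otherwise; sidesteps the heavy skew entirely
insurance_training$ANY_KIDSDRIV <- as.integer(insurance_training$KIDSDRIV > 0)
table(insurance_training$ANY_KIDSDRIV)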
We use the variables AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG, which are likely to influence the target, and we model the log-transformed TARGET_AMT to handle its skewness.
# Multiple Linear Regression - Model 1 (using selected transformed variables)
model1 <- lm(TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG, data = insurance_training)
summary(model1)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG +
## HOMEKIDS_LOG, data = insurance_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.225 -2.273 -1.800 4.265 9.514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.518424 0.263075 13.374 < 2e-16 ***
## AGE -0.025437 0.005407 -4.704 2.59e-06 ***
## CAR_AGE -0.049623 0.007399 -6.707 2.12e-11 ***
## KIDSDRIV_LOG 0.918823 0.161351 5.695 1.28e-08 ***
## HOMEKIDS_LOG 0.326423 0.097973 3.332 0.000867 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared: 0.02669, Adjusted R-squared: 0.02621
## F-statistic: 55.91 on 4 and 8156 DF, p-value: < 2.2e-16
The model above explains only about 2.67% of the variance in the target variable (\(R^2 = 0.0267\)): the predictors are statistically significant, but they account for little of the variability.
The significant predictors are AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG; the log-transformed counts have positive coefficients, while AGE and CAR_AGE are negatively related to the target.
Model 2 again uses the log-transformed variables for stability. It specifies the same formula as Model 1, so its fit is identical; we keep it as a baseline against Model 3. First, a few sanity checks on the predictors:
summary(insurance_training$AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 39.00 45.00 44.79 51.00 81.00
summary(insurance_training$CAR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.000 4.000 8.328 8.328 12.000 28.000
summary(insurance_training$KIDSDRIV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1711 0.0000 4.0000
summary(insurance_training$HOMEKIDS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7212 1.0000 5.0000
any(is.na(insurance_training$AGE))
## [1] FALSE
any(is.na(insurance_training$CAR_AGE))
## [1] FALSE
any(is.na(insurance_training$KIDSDRIV))
## [1] FALSE
any(is.na(insurance_training$HOMEKIDS))
## [1] FALSE
# Multiple Linear Regression - Model 2 (using log-transformed variables)
model2 <- lm(TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG,
data = insurance_training)
summary(model2)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG +
## HOMEKIDS_LOG, data = insurance_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.225 -2.273 -1.800 4.265 9.514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.518424 0.263075 13.374 < 2e-16 ***
## AGE -0.025437 0.005407 -4.704 2.59e-06 ***
## CAR_AGE -0.049623 0.007399 -6.707 2.12e-11 ***
## KIDSDRIV_LOG 0.918823 0.161351 5.695 1.28e-08 ***
## HOMEKIDS_LOG 0.326423 0.097973 3.332 0.000867 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared: 0.02669, Adjusted R-squared: 0.02621
## F-statistic: 55.91 on 4 and 8156 DF, p-value: < 2.2e-16
The individual predictors (AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG) are statistically significant and have the expected signs in terms of their effect on the target variable (TARGET_AMT_LOG).
However, the model fit is weak (with a low R-squared of 0.02669), indicating that these predictors alone do not explain much of the variability in the target variable. There could be other variables or interactions that are not accounted for, or the relationship between predictors and the target may not be linear.
We introduce interaction terms between variables to explore the combined effects of variables on the target.
# Multiple Linear Regression - Model 3 (including interaction terms)
model3 <- lm(TARGET_AMT_LOG ~ AGE * CAR_AGE + KIDSDRIV_LOG * HOMEKIDS_LOG, data = insurance_training)
summary(model3)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE * CAR_AGE + KIDSDRIV_LOG *
## HOMEKIDS_LOG, data = insurance_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.249 -2.248 -1.770 4.240 9.559
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9281839 0.4160701 9.441 < 2e-16 ***
## AGE -0.0347230 0.0089855 -3.864 0.000112 ***
## CAR_AGE -0.1027284 0.0397057 -2.587 0.009692 **
## KIDSDRIV_LOG 1.4434020 0.4781505 3.019 0.002546 **
## HOMEKIDS_LOG 0.3354303 0.0998403 3.360 0.000784 ***
## AGE:CAR_AGE 0.0011640 0.0008567 1.359 0.174262
## KIDSDRIV_LOG:HOMEKIDS_LOG -0.4573381 0.4018289 -1.138 0.255095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.623 on 8154 degrees of freedom
## Multiple R-squared: 0.02707, Adjusted R-squared: 0.02635
## F-statistic: 37.81 on 6 and 8154 DF, p-value: < 2.2e-16
Adding interaction terms did not substantially improve the model fit. The main effects (AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG) remain statistically significant, but the interaction terms are not, suggesting that these interactions do not improve the model’s ability to predict TARGET_AMT_LOG.
Model fit is still weak with low R-squared values, meaning the model is not explaining much of the variability in the target. Further steps could include adding additional predictors or exploring non-linear relationships, or possibly using more advanced models such as random forests or boosting.
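As one possible next step, a tree ensemble can pick up non-linearities and interactions automatically. A minimal sketch, assuming the randomForest package is installed (it is not loaded above, and the settings are illustrative):
# Fit a small random forest on the same predictors
library(randomForest)
set.seed(42)
rf_fit <- randomForest(TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG,
                       data = insurance_training, ntree = 200)
rf_fit  # prints the out-of-bag MSE and % variance explained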
Binary logistic regression predicts a binary outcome (0 or 1). Here we predict TARGET_FLAG (whether a crash occurred: 0 = no, 1 = yes).
We’ll start by using some of the transformed variables that are more normally distributed, as they were transformed to reduce skewness and stabilize the model.
# Logistic Regression - Model 1
log_model1 <- glm(TARGET_FLAG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG,
family = binomial(link = "logit"), data = insurance_training)
summary(log_model1)
##
## Call:
## glm(formula = TARGET_FLAG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG,
## family = binomial(link = "logit"), data = insurance_training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.143951 0.166943 -0.862 0.38853
## AGE -0.017073 0.003469 -4.922 8.58e-07 ***
## CAR_AGE -0.032706 0.004799 -6.815 9.40e-12 ***
## KIDSDRIV_LOG 0.519439 0.093623 5.548 2.89e-08 ***
## HOMEKIDS_LOG 0.186831 0.060143 3.106 0.00189 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 9198.1 on 8156 degrees of freedom
## AIC: 9208.1
##
## Number of Fisher Scoring iterations: 4
# Check multicollinearity using VIF
vif(log_model1)
## AGE CAR_AGE KIDSDRIV_LOG HOMEKIDS_LOG
## 1.362228 1.029894 1.380320 1.771764
AGE and CAR_AGE both have a negative relationship with the probability of TARGET_FLAG = 1, meaning as age and car age increase, the likelihood of the target outcome decreases.
KIDSDRIV_LOG and HOMEKIDS_LOG both have positive relationships with the target outcome, meaning that as these variables increase, the likelihood of TARGET_FLAG = 1 increases.
The model’s fit is acceptable, but there is room for improvement, as indicated by the residual deviance and AIC.
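Two quick follow-up checks on this fit: exponentiating the coefficients converts log-odds to odds ratios, and McFadden's pseudo-R² summarizes how much of the null deviance the model removes. A minimal sketch:
# Odds ratios: multiplicative change in the odds of a crash per unit increase
exp(coef(log_model1))
# McFadden's pseudo-R^2: 1 - residual deviance / null deviance
1 - log_model1$deviance / log_model1$null.deviance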
# Apply log transformation to variables in the evaluation dataset
insurance_evaluation$KIDSDRIV_LOG <- log(insurance_evaluation$KIDSDRIV + 1)
insurance_evaluation$HOMEKIDS_LOG <- log(insurance_evaluation$HOMEKIDS + 1)
Evaluating the model:
# Residual diagnostics
par(mfrow = c(2, 2))
plot(log_model1)
# Predict crash probabilities on the training and evaluation datasets.
# Note: predict() on a glm defaults to the link (log-odds) scale, so
# exp(...) - 1 would not yield an amount; type = "response" gives probabilities.
insurance_training$PRED_PROB <- predict(log_model1, newdata = insurance_training, type = "response")
insurance_evaluation$PRED_PROB <- predict(log_model1, newdata = insurance_evaluation, type = "response")
We include interaction terms to explore the effect of variable combinations on the target variable.
# Logistic Regression - Model 2 (including interaction terms + KIDSDRIV_RATIO)
log_model2 <- glm(TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + HOMEKIDS_LOG,
family = binomial(link = "logit"), data = insurance_training)
summary(log_model2)
##
## Call:
## glm(formula = TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO +
## HOMEKIDS_LOG, family = binomial(link = "logit"), data = insurance_training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0820063 0.2532847 -0.324 0.746111
## AGE -0.0185297 0.0055483 -3.340 0.000839 ***
## CAR_AGE -0.0475421 0.0254602 -1.867 0.061858 .
## KIDSDRIV_RATIO 10.7397294 2.1750050 4.938 7.9e-07 ***
## HOMEKIDS_LOG 0.2149380 0.0590505 3.640 0.000273 ***
## AGE:CAR_AGE 0.0003346 0.0005585 0.599 0.549066
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 9204.3 on 8155 degrees of freedom
## AIC: 9216.3
##
## Number of Fisher Scoring iterations: 4
The significant predictors in this model are AGE, KIDSDRIV_RATIO, and HOMEKIDS_LOG, indicating they are important in predicting the outcome (TARGET_FLAG).
The model is not greatly improved by the interaction term (AGE:CAR_AGE), suggesting that there is no strong interaction effect between AGE and CAR_AGE.
The CAR_AGE predictor is marginally significant, suggesting a potential relationship, but it is not as strong as the other variables.
# Logistic Regression - Model 3 (same formula as Model 2; refit here, so the output below is identical)
log_model3 <- glm(TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + HOMEKIDS_LOG,
family = binomial(link = "logit"), data = insurance_training)
summary(log_model3)
##
## Call:
## glm(formula = TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO +
## HOMEKIDS_LOG, family = binomial(link = "logit"), data = insurance_training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0820063 0.2532847 -0.324 0.746111
## AGE -0.0185297 0.0055483 -3.340 0.000839 ***
## CAR_AGE -0.0475421 0.0254602 -1.867 0.061858 .
## KIDSDRIV_RATIO 10.7397294 2.1750050 4.938 7.9e-07 ***
## HOMEKIDS_LOG 0.2149380 0.0590505 3.640 0.000273 ***
## AGE:CAR_AGE 0.0003346 0.0005585 0.599 0.549066
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 9204.3 on 8155 degrees of freedom
## AIC: 9216.3
##
## Number of Fisher Scoring iterations: 4
AGE and KIDSDRIV_RATIO are the strongest predictors, with KIDSDRIV_RATIO having a particularly large effect on the outcome.
CAR_AGE has a weaker, marginally significant effect, while HOMEKIDS_LOG also contributes significantly to the model.
The interaction between AGE and CAR_AGE does not significantly improve the model.
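The interaction can also be tested formally with a likelihood-ratio test against the nested model that drops it — a minimal sketch using update():
# Refit without the interaction, then compare nested models via a chi-squared LRT
log_model_no_int <- update(log_model2, . ~ . - AGE:CAR_AGE)
anova(log_model_no_int, log_model2, test = "Chisq")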
In this section, we evaluate the multiple linear regression and binary logistic regression models against several criteria. The goal is to select the models that best balance performance and interpretability, while also considering the business context and model simplicity. We explain the selection criteria, address potential issues such as multicollinearity, and discuss the relevant model outputs.
The key objective for the multiple linear regression model is to find the best model that explains the variability in the target variable (TARGET_AMT_LOG).
Let’s first check multicollinearity and overall fit for Model 1; the coefficients and standard errors are extracted afterwards:
# Model Evaluation for Multiple Linear Regression - Model 1
# Check for multicollinearity (VIF)
vif(model1) # Variance Inflation Factor (VIF)
## AGE CAR_AGE KIDSDRIV_LOG HOMEKIDS_LOG
## 1.352025 1.036944 1.359481 1.746139
# Calculate R-squared, Adjusted R-squared, RMSE, and F-statistic
summary(model1)
##
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG +
## HOMEKIDS_LOG, data = insurance_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.225 -2.273 -1.800 4.265 9.514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.518424 0.263075 13.374 < 2e-16 ***
## AGE -0.025437 0.005407 -4.704 2.59e-06 ***
## CAR_AGE -0.049623 0.007399 -6.707 2.12e-11 ***
## KIDSDRIV_LOG 0.918823 0.161351 5.695 1.28e-08 ***
## HOMEKIDS_LOG 0.326423 0.097973 3.332 0.000867 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared: 0.02669, Adjusted R-squared: 0.02621
## F-statistic: 55.91 on 4 and 8156 DF, p-value: < 2.2e-16
# Plot residuals
par(mfrow = c(2, 2))
plot(model1)
# RMSE Calculation
rmse_model1 <- sqrt(mean(model1$residuals^2))
# Display results
cat("Adjusted R^2: ", summary(model1)$adj.r.squared, "\n")
## Adjusted R^2: 0.02621179
cat("RMSE: ", rmse_model1, "\n")
## RMSE: 3.621839
cat("F-statistic: ", summary(model1)$fstatistic[1], "\n")
## F-statistic: 55.91137
The model appears to have statistically significant predictors (with very low p-values), but the overall fit is poor as indicated by the low R-squared and adjusted R-squared values. This suggests that while individual predictors like age, car age, and home kids may have a significant relationship with the target variable, the model is not explaining much of the variability in the target variable. Further model refinement or additional predictors may be necessary for a better fit.
# Linear Models
coeff_model1 <- summary(model1)$coefficients
coeff_model2 <- summary(model2)$coefficients
coeff_model3 <- summary(model3)$coefficients
# Logistic Models
coeff_log_model1 <- summary(log_model1)$coefficients
coeff_log_model2 <- summary(log_model2)$coefficients
coeff_log_model3 <- summary(log_model3)$coefficients
# Display coefficients
print("Linear Model 1 Coefficients:")
## [1] "Linear Model 1 Coefficients:"
coeff_model1
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51842381 0.263075364 13.374205 2.273907e-40
## AGE -0.02543688 0.005407291 -4.704182 2.590791e-06
## CAR_AGE -0.04962289 0.007399082 -6.706627 2.124461e-11
## KIDSDRIV_LOG 0.91882323 0.161351303 5.694551 1.279887e-08
## HOMEKIDS_LOG 0.32642278 0.097973128 3.331758 8.668289e-04
print("Linear Model 2 Coefficients:")
## [1] "Linear Model 2 Coefficients:"
coeff_model2
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51842381 0.263075364 13.374205 2.273907e-40
## AGE -0.02543688 0.005407291 -4.704182 2.590791e-06
## CAR_AGE -0.04962289 0.007399082 -6.706627 2.124461e-11
## KIDSDRIV_LOG 0.91882323 0.161351303 5.694551 1.279887e-08
## HOMEKIDS_LOG 0.32642278 0.097973128 3.331758 8.668289e-04
print("Linear Model 3 Coefficients:")
## [1] "Linear Model 3 Coefficients:"
coeff_model3
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.928183890 0.4160700703 9.441159 4.720873e-21
## AGE -0.034722973 0.0089855105 -3.864329 1.122585e-04
## CAR_AGE -0.102728387 0.0397057240 -2.587244 9.691855e-03
## KIDSDRIV_LOG 1.443402034 0.4781504595 3.018719 2.546304e-03
## HOMEKIDS_LOG 0.335430254 0.0998403333 3.359667 7.839450e-04
## AGE:CAR_AGE 0.001164017 0.0008566794 1.358755 1.742619e-01
## KIDSDRIV_LOG:HOMEKIDS_LOG -0.457338073 0.4018288879 -1.138141 2.550949e-01
print("Logistic Model 1 Coefficients:")
## [1] "Logistic Model 1 Coefficients:"
coeff_log_model1
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.14395111 0.166942609 -0.862279 3.885340e-01
## AGE -0.01707333 0.003468944 -4.921766 8.576694e-07
## CAR_AGE -0.03270645 0.004798914 -6.815385 9.401145e-12
## KIDSDRIV_LOG 0.51943937 0.093623332 5.548183 2.886538e-08
## HOMEKIDS_LOG 0.18683124 0.060143125 3.106444 1.893523e-03
print("Logistic Model 2 Coefficients:")
## [1] "Logistic Model 2 Coefficients:"
coeff_log_model2
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0820063355 0.2532846535 -0.3237714 7.461111e-01
## AGE -0.0185296892 0.0055483452 -3.3396785 8.387542e-04
## CAR_AGE -0.0475420986 0.0254601629 -1.8673132 6.185786e-02
## KIDSDRIV_RATIO 10.7397293775 2.1750050091 4.9377952 7.901074e-07
## HOMEKIDS_LOG 0.2149379711 0.0590505149 3.6399000 2.727439e-04
## AGE:CAR_AGE 0.0003346359 0.0005585082 0.5991602 5.490660e-01
# Linear Models
aic_model1 <- AIC(model1)
aic_model2 <- AIC(model2) # Will be same as model1
aic_model3 <- AIC(model3)
adjusted_r2_model1 <- summary(model1)$adj.r.squared
adjusted_r2_model2 <- summary(model2)$adj.r.squared
adjusted_r2_model3 <- summary(model3)$adj.r.squared
# Logistic Models
aic_log_model1 <- AIC(log_model1)
aic_log_model2 <- AIC(log_model2)
aic_log_model3 <- AIC(log_model3)
# Display results
cat("Linear Models AIC and Adjusted R²:\n")
## Linear Models AIC and Adjusted R²:
cat("Model 1: AIC =", aic_model1, "Adjusted R² =", adjusted_r2_model1, "\n")
## Model 1: AIC = 44178.03 Adjusted R² = 0.02621179
cat("Model 2: AIC =", aic_model2, "Adjusted R² =", adjusted_r2_model2, "\n")
## Model 2: AIC = 44178.03 Adjusted R² = 0.02621179
cat("Model 3: AIC =", aic_model3, "Adjusted R² =", adjusted_r2_model3, "\n")
## Model 3: AIC = 44178.84 Adjusted R² = 0.02635403
cat("\nLogistic Models AIC:\n")
##
## Logistic Models AIC:
cat("Model 1: AIC =", aic_log_model1, "\n")
## Model 1: AIC = 9208.064
cat("Model 2: AIC =", aic_log_model2, "\n")
## Model 2: AIC = 9216.267
cat("Model 3: AIC =", aic_log_model3, "\n")
## Model 3: AIC = 9216.267
Linear Regression Models
Models 1 and 2 are identical (same formula), as reflected by matching coefficients, AIC (44178.03), and Adjusted R² (0.0262).
Model 3 adds interaction terms (AGE:CAR_AGE and KIDSDRIV_LOG:HOMEKIDS_LOG). Its Adjusted R² is slightly higher (0.0264), but so is its AIC (44178.84), so Model 3 does not perform better overall.
Decision for Linear Models:
Model 1 (equivalently Model 2) is preferred due to lower AIC, simpler structure, and comparable Adjusted R².
Logistic Regression Models
Model 1: significant predictors are AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG (p-values < 0.05).
Models 2 and 3 (identical specifications): significant predictors are AGE, KIDSDRIV_RATIO, and HOMEKIDS_LOG; the interaction term AGE:CAR_AGE is not significant (p = 0.549), indicating no meaningful contribution.
Decision for Logistic Models:
Model 1 is preferred due to lower AIC and a more parsimonious structure.
Based on these metrics and comparisons, our final selection is Model 1 for both the linear regression and the logistic regression.
Let’s generate ROC curves to support this decision:
# Predict probabilities on the training dataset
probabilities <- predict(log_model1, newdata = insurance_training, type = "response")
# Calculate the ROC curve
roc_curve <- roc(insurance_training$TARGET_FLAG, probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, col = "blue", lwd = 2,
main = "Corrected ROC Curve for Logistic Model 1",
xlab = "False Positive Rate", ylab = "True Positive Rate",
xlim = c(0, 1), ylim = c(0, 1)) # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red") # Add diagonal line
# Display the AUC
auc(roc_curve)
## Area under the curve: 0.6095
# Predict probabilities on the training dataset
probabilities2 <- predict(log_model2, newdata = insurance_training, type = "response")
# Calculate the ROC curve
roc_curve2 <- roc(insurance_training$TARGET_FLAG, probabilities2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve2, col = "blue", lwd = 2,
main = "Corrected ROC Curve for Logistic Model 2",
xlab = "False Positive Rate", ylab = "True Positive Rate",
xlim = c(0, 1), ylim = c(0, 1)) # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red") # Add diagonal line
# Display the AUC
auc(roc_curve2)
## Area under the curve: 0.6072
# Predict probabilities on the training dataset
probabilities3 <- predict(log_model3, newdata = insurance_training, type = "response")
# Calculate the ROC curve
roc_curve3 <- roc(insurance_training$TARGET_FLAG, probabilities3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve3, col = "blue", lwd = 2,
main = "Corrected ROC Curve for Logistic Model 3",
xlab = "False Positive Rate", ylab = "True Positive Rate",
xlim = c(0, 1), ylim = c(0, 1)) # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red") # Add diagonal line
# Display the AUC
auc(roc_curve3)
## Area under the curve: 0.6072
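The AUC values (all near 0.61) indicate only modest discrimination for these predictor sets. As a complementary check, a confusion matrix at a 0.5 probability cutoff can be computed with caret; a minimal sketch (the 0.5 cutoff is our choice, and given the class imbalance a tuned threshold may be more appropriate):
# Classify at a 0.5 cutoff and tabulate against the observed flags
pred_class <- factor(ifelse(probabilities > 0.5, 1, 0), levels = c(0, 1))
actual <- factor(insurance_training$TARGET_FLAG, levels = c(0, 1))
confusionMatrix(pred_class, actual, positive = "1")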