Loading Necessary Libraries and the data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.1
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
insurance_training <- read.csv("insurance_training_data.csv")
insurance_evaluation <- read.csv("insurance-evaluation-data.csv")

1. Data Exploration:

a. Mean / Standard Deviation / Median

# Quick exploration
str(insurance_training)  # Structure of data
## 'data.frame':    8161 obs. of  26 variables:
##  $ INDEX      : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET_FLAG: int  0 0 0 0 0 1 0 1 1 0 ...
##  $ TARGET_AMT : num  0 0 0 0 0 ...
##  $ KIDSDRIV   : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE        : int  60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS   : int  0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ        : int  11 11 10 14 NA 12 NA NA 10 7 ...
##  $ INCOME     : chr  "$67,349" "$91,449" "$16,039" "" ...
##  $ PARENT1    : chr  "No" "No" "No" "No" ...
##  $ HOME_VAL   : chr  "$0" "$257,252" "$124,191" "$306,251" ...
##  $ MSTATUS    : chr  "z_No" "z_No" "Yes" "Yes" ...
##  $ SEX        : chr  "M" "M" "z_F" "M" ...
##  $ EDUCATION  : chr  "PhD" "z_High School" "z_High School" "<High School" ...
##  $ JOB        : chr  "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
##  $ TRAVTIME   : int  14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE    : chr  "Private" "Commercial" "Private" "Private" ...
##  $ BLUEBOOK   : chr  "$14,230" "$14,940" "$4,010" "$15,440" ...
##  $ TIF        : int  11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE   : chr  "Minivan" "Minivan" "z_SUV" "Minivan" ...
##  $ RED_CAR    : chr  "yes" "yes" "no" "yes" ...
##  $ OLDCLAIM   : chr  "$4,461" "$0" "$38,690" "$0" ...
##  $ CLM_FREQ   : int  2 0 2 0 2 0 0 1 0 0 ...
##  $ REVOKED    : chr  "No" "No" "No" "No" ...
##  $ MVR_PTS    : int  3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE    : int  18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY : chr  "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
summary(insurance_training)  # Summary statistics
##      INDEX        TARGET_FLAG       TARGET_AMT        KIDSDRIV     
##  Min.   :    1   Min.   :0.0000   Min.   :     0   Min.   :0.0000  
##  1st Qu.: 2559   1st Qu.:0.0000   1st Qu.:     0   1st Qu.:0.0000  
##  Median : 5133   Median :0.0000   Median :     0   Median :0.0000  
##  Mean   : 5152   Mean   :0.2638   Mean   :  1504   Mean   :0.1711  
##  3rd Qu.: 7745   3rd Qu.:1.0000   3rd Qu.:  1036   3rd Qu.:0.0000  
##  Max.   :10302   Max.   :1.0000   Max.   :107586   Max.   :4.0000  
##                                                                    
##       AGE           HOMEKIDS           YOJ          INCOME         
##  Min.   :16.00   Min.   :0.0000   Min.   : 0.0   Length:8161       
##  1st Qu.:39.00   1st Qu.:0.0000   1st Qu.: 9.0   Class :character  
##  Median :45.00   Median :0.0000   Median :11.0   Mode  :character  
##  Mean   :44.79   Mean   :0.7212   Mean   :10.5                     
##  3rd Qu.:51.00   3rd Qu.:1.0000   3rd Qu.:13.0                     
##  Max.   :81.00   Max.   :5.0000   Max.   :23.0                     
##  NA's   :6                        NA's   :454                      
##    PARENT1            HOME_VAL           MSTATUS              SEX           
##  Length:8161        Length:8161        Length:8161        Length:8161       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   EDUCATION             JOB               TRAVTIME        CAR_USE         
##  Length:8161        Length:8161        Min.   :  5.00   Length:8161       
##  Class :character   Class :character   1st Qu.: 22.00   Class :character  
##  Mode  :character   Mode  :character   Median : 33.00   Mode  :character  
##                                        Mean   : 33.49                     
##                                        3rd Qu.: 44.00                     
##                                        Max.   :142.00                     
##                                                                           
##    BLUEBOOK              TIF           CAR_TYPE           RED_CAR         
##  Length:8161        Min.   : 1.000   Length:8161        Length:8161       
##  Class :character   1st Qu.: 1.000   Class :character   Class :character  
##  Mode  :character   Median : 4.000   Mode  :character   Mode  :character  
##                     Mean   : 5.351                                        
##                     3rd Qu.: 7.000                                        
##                     Max.   :25.000                                        
##                                                                           
##    OLDCLAIM            CLM_FREQ        REVOKED             MVR_PTS      
##  Length:8161        Min.   :0.0000   Length:8161        Min.   : 0.000  
##  Class :character   1st Qu.:0.0000   Class :character   1st Qu.: 0.000  
##  Mode  :character   Median :0.0000   Mode  :character   Median : 1.000  
##                     Mean   :0.7986                      Mean   : 1.696  
##                     3rd Qu.:2.0000                      3rd Qu.: 3.000  
##                     Max.   :5.0000                      Max.   :13.000  
##                                                                         
##     CAR_AGE        URBANICITY       
##  Min.   :-3.000   Length:8161       
##  1st Qu.: 1.000   Class :character  
##  Median : 8.000   Mode  :character  
##  Mean   : 8.328                     
##  3rd Qu.:12.000                     
##  Max.   :28.000                     
##  NA's   :510
# Check for missing values
colSums(is.na(insurance_training))
##       INDEX TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS 
##           0           0           0           0           6           0 
##         YOJ      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX 
##         454           0           0           0           0           0 
##   EDUCATION         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF 
##           0           0           0           0           0           0 
##    CAR_TYPE     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS 
##           0           0           0           0           0           0 
##     CAR_AGE  URBANICITY 
##         510           0

The data has 8161 observations and 25 variables (excluding the INDEX which won’t be used for the analysis).

The primary target variable is TARGET_FLAG, a binary indicator representing whether a car was in crash, and the secondary target TARGET_AMT indicates the amount of the cost if a car was in crash.

AGE has a mean of 44.8 years (SD = 14.3) with a median age of 45, indicating a balanced age distribution. TRAVTIME (commute time to work) averages 33.5 minutes, with most values clustered between 22 and 44 minutes. A full table of key statistics is included above for reference.

Several variables have missing values:

  • AGE (6 missing values), YOJ (454), INCOME (many blanks), and CAR_AGE (510). We are going to apply imputation strategies to address these gaps. Missing AGE values will be replaced with the median (45 years).

  • YOJ and CAR_AGE will be imputed using their median values (11 and 8 years, respectively). INCOME, recorded as character strings, will be cleaned and converted to numeric, with missing values replaced by the median.

b. Bar Chart or Box Plot of the data

b.1. Visualize Numeric Variables

# Histogram for AGE
ggplot(insurance_training, aes(x = AGE)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  labs(title = "Age Distribution", x = "Age", y = "Frequency")
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplot for CAR_AGE
ggplot(insurance_training, aes(y = CAR_AGE)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Boxplot of CAR_AGE", y = "Car Age")
## Warning: Removed 510 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Remove rows with missing values
numeric_data <- na.omit(insurance_training[, sapply(insurance_training, is.numeric)])

# Checking correlation between numeric features
numeric_vars <- sapply(insurance_training, is.numeric)
correlation_matrix <- cor(insurance_training[, numeric_vars], use = "pairwise.complete.obs")
print(correlation_matrix)
##                     INDEX  TARGET_FLAG    TARGET_AMT     KIDSDRIV           AGE
## INDEX        1.0000000000 -0.001669645 -0.0005934765  0.015575660  3.384609e-02
## TARGET_FLAG -0.0016696445  1.000000000  0.5342460609  0.103668296 -1.032167e-01
## TARGET_AMT  -0.0005934765  0.534246061  1.0000000000  0.055394177 -4.172832e-02
## KIDSDRIV     0.0155756605  0.103668296  0.0553941768  1.000000000 -7.517883e-02
## AGE          0.0338460913 -0.103216708 -0.0417283235 -0.075178828  1.000000e+00
## HOMEKIDS     0.0000521436  0.115621011  0.0619880434  0.464015239 -4.454410e-01
## YOJ          0.0267332475 -0.070511825 -0.0220851981  0.043304800  1.360725e-01
## TRAVTIME    -0.0230701278  0.048368310  0.0279870160  0.008447299  5.269488e-03
## TIF         -0.0092139808 -0.082370050 -0.0464808306 -0.001988715 -6.631285e-05
## CLM_FREQ     0.0187802762  0.216196061  0.1164191586  0.037062929 -2.409232e-02
## MVR_PTS      0.0078825302  0.219197054  0.1378655086  0.053566373 -7.157543e-02
## CAR_AGE     -0.0006986235 -0.100650615 -0.0588221106 -0.053993001  1.762208e-01
##                  HOMEKIDS         YOJ     TRAVTIME           TIF     CLM_FREQ
## INDEX        0.0000521436  0.02673325 -0.023070128 -9.213981e-03  0.018780276
## TARGET_FLAG  0.1156210106 -0.07051183  0.048368310 -8.237005e-02  0.216196061
## TARGET_AMT   0.0619880434 -0.02208520  0.027987016 -4.648083e-02  0.116419159
## KIDSDRIV     0.4640152389  0.04330480  0.008447299 -1.988715e-03  0.037062929
## AGE         -0.4454410402  0.13607248  0.005269488 -6.631285e-05 -0.024092321
## HOMEKIDS     1.0000000000  0.08682902 -0.007245604  1.181332e-02  0.029349289
## YOJ          0.0868290246  1.00000000 -0.016945311  2.478659e-02 -0.026308028
## TRAVTIME    -0.0072456039 -0.01694531  1.000000000 -1.160463e-02  0.006560211
## TIF          0.0118133187  0.02478659 -0.011604626  1.000000e+00 -0.023022955
## CLM_FREQ     0.0293492894 -0.02630803  0.006560211 -2.302295e-02  1.000000000
## MVR_PTS      0.0606013438 -0.03785508  0.010598511 -4.104573e-02  0.396638373
## CAR_AGE     -0.1521463981  0.06140648 -0.038232806  7.767352e-03 -0.009318765
##                 MVR_PTS       CAR_AGE
## INDEX        0.00788253 -0.0006986235
## TARGET_FLAG  0.21919705 -0.1006506150
## TARGET_AMT   0.13786551 -0.0588221106
## KIDSDRIV     0.05356637 -0.0539930013
## AGE         -0.07157543  0.1762208228
## HOMEKIDS     0.06060134 -0.1521463981
## YOJ         -0.03785508  0.0614064819
## TRAVTIME     0.01059851 -0.0382328060
## TIF         -0.04104573  0.0077673523
## CLM_FREQ     0.39663837 -0.0093187652
## MVR_PTS      1.00000000 -0.0199040729
## CAR_AGE     -0.01990407  1.0000000000

The histogram for the variable AGE shows an approximately normal distribution, with the highest frequency occurring around the median age of 45.

For CAR_AGE, outliers were evident, with the minimum value being -4, indicating that a negtive age doesn’t make sense.

A correlation matrix highlighted several key relationships:

  • The target variable TARGET_FLAG has a moderate positive correlation with MVR_PTS (Motor Vehicle Record Points, 0.22) and CLM_FREQ (Claim Frequency, 0.22). These variables are strong candidates for inclusion in the logistic regression model.

  • AGE and CAR_AGE exhibit weak negative correlations with TARGET_FLAG (-0.10 each), suggesting limited predictive value.

  • TARGET_AMT is moderately correlated with TARGET_FLAG (0.53), as expected, since claim amounts/cost depend on whether a claim was filed for a crashed car.

b.2. Visualize Categorical Variables:

# Bar plot for CAR_TYPE
ggplot(insurance_training, aes(x = CAR_TYPE)) +
  geom_bar(fill = "purple") +
  labs(title = "Car Type Distribution", x = "Car Type", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar plot for TARGET_FLAG
ggplot(insurance_training, aes(x = factor(TARGET_FLAG))) +
  geom_bar(fill = "orange") +
  labs(title = "Target Flag Distribution", x = "Target Flag (0 = No Crash, 1 = Crash)", y = "Count")

Vehicle Types: The dataset includes a variety of vehicle categories, with the most common being SUVs (Z_SUV), accounting for 2,260 entries, followed by:

  • Minivans: 2,020 entries

  • Pickup Trucks: 1,375 entries

  • Sports Cars: 875 entries

  • Vans: 750 entries

  • Panel Trucks: 700 entries

This distribution highlights a predominance of family-oriented and utility vehicles, potentially influencing claim tendencies.

Target Flag Distribution: The target variable TARGET_FLAG has a highly imbalanced distribution:

  • No Crash (0): 6,000 instances (approximately 73.5% of the data).

  • Crash (1): 2,100 instances (approximately 26.5%).

The imbalance will be considered in model development, potentially requiring techniques like weighting, oversampling, or undersampling to ensure accurate prediction.

# identify missing values
colSums(is.na(insurance_training))
##       INDEX TARGET_FLAG  TARGET_AMT    KIDSDRIV         AGE    HOMEKIDS 
##           0           0           0           0           6           0 
##         YOJ      INCOME     PARENT1    HOME_VAL     MSTATUS         SEX 
##         454           0           0           0           0           0 
##   EDUCATION         JOB    TRAVTIME     CAR_USE    BLUEBOOK         TIF 
##           0           0           0           0           0           0 
##    CAR_TYPE     RED_CAR    OLDCLAIM    CLM_FREQ     REVOKED     MVR_PTS 
##           0           0           0           0           0           0 
##     CAR_AGE  URBANICITY 
##         510           0

2. Data Preparation:

a. Handling Missing Values:

Since the variables that have missing values are numeric, we are going to impute the missing values using the mean.

# Impute missing values for numerical variables with mean
numeric_vars <- sapply(insurance_training, is.numeric)
insurance_training[numeric_vars] <- lapply(insurance_training[numeric_vars], function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x))

b. Creating Flags for Missing Values:

# Loop through all variables to create flags for missing values
for (var in colnames(insurance_training)) {
  insurance_training[paste0(var, "_FLAG")] <- ifelse(is.na(insurance_training[[var]]), 1, 0)
}

# Check the new flags columns
head(insurance_training)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS      YOJ   INCOME PARENT1
## 1     1           0          0        0  60        0 11.00000  $67,349      No
## 2     2           0          0        0  43        0 11.00000  $91,449      No
## 3     4           0          0        0  35        1 10.00000  $16,039      No
## 4     5           0          0        0  51        0 14.00000               No
## 5     6           0          0        0  50        0 10.49929 $114,986      No
## 6     7           1       2946        0  34        1 12.00000 $125,301     Yes
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE BLUEBOOK
## 1       $0    z_No   M           PhD  Professional       14    Private  $14,230
## 2 $257,252    z_No   M z_High School z_Blue Collar       22 Commercial  $14,940
## 3 $124,191     Yes z_F z_High School      Clerical        5    Private   $4,010
## 4 $306,251     Yes   M  <High School z_Blue Collar       32    Private  $15,440
## 5 $243,925     Yes z_F           PhD        Doctor       36    Private  $18,000
## 6       $0    z_No z_F     Bachelors z_Blue Collar       46 Commercial  $17,430
##   TIF   CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1  11    Minivan     yes   $4,461        2      No       3      18
## 2   1    Minivan     yes       $0        0      No       0       1
## 3   4      z_SUV      no  $38,690        2      No       3      10
## 4   7    Minivan     yes       $0        0      No       0       6
## 5   1      z_SUV      no  $19,217        2     Yes       3      17
## 6   1 Sports Car      no       $0        0      No       0       7
##            URBANICITY INDEX_FLAG TARGET_FLAG_FLAG TARGET_AMT_FLAG KIDSDRIV_FLAG
## 1 Highly Urban/ Urban          0                0               0             0
## 2 Highly Urban/ Urban          0                0               0             0
## 3 Highly Urban/ Urban          0                0               0             0
## 4 Highly Urban/ Urban          0                0               0             0
## 5 Highly Urban/ Urban          0                0               0             0
## 6 Highly Urban/ Urban          0                0               0             0
##   AGE_FLAG HOMEKIDS_FLAG YOJ_FLAG INCOME_FLAG PARENT1_FLAG HOME_VAL_FLAG
## 1        0             0        0           0            0             0
## 2        0             0        0           0            0             0
## 3        0             0        0           0            0             0
## 4        0             0        0           0            0             0
## 5        0             0        0           0            0             0
## 6        0             0        0           0            0             0
##   MSTATUS_FLAG SEX_FLAG EDUCATION_FLAG JOB_FLAG TRAVTIME_FLAG CAR_USE_FLAG
## 1            0        0              0        0             0            0
## 2            0        0              0        0             0            0
## 3            0        0              0        0             0            0
## 4            0        0              0        0             0            0
## 5            0        0              0        0             0            0
## 6            0        0              0        0             0            0
##   BLUEBOOK_FLAG TIF_FLAG CAR_TYPE_FLAG RED_CAR_FLAG OLDCLAIM_FLAG CLM_FREQ_FLAG
## 1             0        0             0            0             0             0
## 2             0        0             0            0             0             0
## 3             0        0             0            0             0             0
## 4             0        0             0            0             0             0
## 5             0        0             0            0             0             0
## 6             0        0             0            0             0             0
##   REVOKED_FLAG MVR_PTS_FLAG CAR_AGE_FLAG URBANICITY_FLAG
## 1            0            0            0               0
## 2            0            0            0               0
## 3            0            0            0               0
## 4            0            0            0               0
## 5            0            0            0               0
## 6            0            0            0               0
  • The paste0(var, “_FLAG”) dynamically creates the name for the new flag column based on the original variable name (e.g., if the original variable is AGE, the flag column will be AGE_FLAG).

  • ifelse(is.na(insurance_training[[var]]), 1, 0) checks if the value is missing (NA), and if it is, it assigns a 1; otherwise, it assigns a 0.

c. Transforming data by putting it into buckets:

In this sub-section, we are going to bucketize the continuous variables; AGE and TARGET_AMT:

# Bucketize AGE into ranges
insurance_training$AGE_BUCKET <- cut(insurance_training$AGE,
                                breaks = c(18, 30, 50, 70, Inf),
                                labels = c("18-30", "31-50", "51-70", "70+"))

# Bucketize TARGET_AMT into categories
insurance_training$TARGET_AMT_BUCKET <- cut(insurance_training$TARGET_AMT,
                                       breaks = c(0, 1000, 5000, 10000, Inf),
                                       labels = c("0-1000", "1001-5000", "5001-10000", "10000+"))

# Check the bucketized varaibles
table(insurance_training$AGE_BUCKET)
## 
## 18-30 31-50 51-70   70+ 
##   400  5638  2105     9
table(insurance_training$TARGET_AMT_BUCKET)
## 
##     0-1000  1001-5000 5001-10000     10000+ 
##        102       1267        629        155

By bucketizing AGE into discrete categories, it makes the variable easier to interpret and analyze. Similarly, bucketizing TARGET_AMT helps transform a continuous variable with potentially high variation into manageable categories. This can help with clearer reporting and analysis of trends.

d. Mathematical transforms such as log or square root (or use Box-Cox):

First and to have a clear decision about the type of transformation based on the skewness of each variable:

# Check skewness for numeric variables
skew_values <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT", "KIDSDRIV", "HOMEKIDS")], skewness, na.rm = TRUE)

# View skewness values
print(skew_values)
##         AGE     CAR_AGE  TARGET_AMT    KIDSDRIV    HOMEKIDS 
## -0.02899961  0.29120232  8.70630337  3.35183743  1.34112709

Interpretations:

  • AGE: -0.03 This value is close to 0, indicating that the AGE variable is approximately normally distributed. No transformation is needed.

  • CAR_AGE: 0.30 The skewness of CAR_AGE is slightly positive, but it is relatively close to 0, meaning it is only mildly skewed. We may not need a transformation for this variable, as the skewness is not severe.

  • TARGET_AMT: 8.71 This is highly positively skewed, with a skewness greater than 1. This suggests that TARGET_AMT has a long right tail, which is typical for monetary data. A log transformation would be helpful in normalizing this variable.

  • KIDSDRIV: 3.35 This has significant positive skewness, but it’s not extreme. If you want to reduce the skewness, you could consider a log transformation, but it might not be absolutely necessary if the model can handle the skewness well.

  • HOMEKIDS: 1.34 This value also indicates mild positive skewness. Similar to CAR_AGE, no transformation is strictly necessary, but a log transformation could slightly improve the distribution, especially if we are aiming for perfect normality.

Now, based on the skewness above, we only need to log-transform the TARGET_AMT, and the other two variables that have a slight high skewness:

# Apply log transformation to TARGET_AMT amd the others
insurance_training$TARGET_AMT_LOG <- log(insurance_training$TARGET_AMT + 1)

insurance_training$KIDSDRIV_LOG <- log(insurance_training$KIDSDRIV + 1)
insurance_training$HOMEKIDS_LOG <- log(insurance_training$HOMEKIDS + 1)

Let’s check the skewness values after the transformations we performed above:

# Check skewness after applying the transformations
skew_values_after_transformation <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_LOG", "KIDSDRIV_LOG", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)

# View the skewness values after transformation
print(skew_values_after_transformation)
##            AGE        CAR_AGE TARGET_AMT_LOG   KIDSDRIV_LOG   HOMEKIDS_LOG 
##    -0.02899961     0.29120232     1.11518775     2.73381482     0.93255965

That is good progress;

  • The log transformation on TARGET_AMT has reduced the skewness significantly, but it remains moderately skewed. This is typical for monetary variables. The transformation has improved the distribution but could still benefit from further adjustments.

  • The transformation on KIDSDRIV has reduced the skewness but it is still quite positive. This suggests that the log transformation helped, but the variable is still somewhat skewed. We should consider another transformation.

  • The log transformation on HOMEKIDS has reduced the skewness to a more acceptable level, bringing it closer to zero. This variable is now much more normally distributed and ready for modeling.

One additional tranformation that can help us normalize the continuous variable TARGET_AMTis Box-Cox Transformation

insurance_training$TARGET_AMT_SHIFTED <- insurance_training$TARGET_AMT + 1
boxcox_result <- boxcox(TARGET_AMT_SHIFTED ~ 1, data = insurance_training)

lambda <- boxcox_result$x[which.max(boxcox_result$y)]
insurance_training$TARGET_AMT_BOXCOX <- (insurance_training$TARGET_AMT_SHIFTED^lambda - 1) / lambda

we can also perform the square root transformation:

insurance_training$TARGET_AMT_SQRT <- sqrt(insurance_training$TARGET_AMT)

Let’s do the same thing for the variable KIDSDRIV:

First, Box-Cox:

insurance_training$KIDSDRIV_BOXCOX <- (insurance_training$KIDSDRIV + 1)^lambda - 1

Then, we can use Cube Root transformation:

insurance_training$KIDSDRIV_CUBE <- sign(insurance_training$KIDSDRIV) * abs(insurance_training$KIDSDRIV)^(1/3)

Let’s check once more for after-transformations-skewness

# Check skewness after applying the transformations
skew_values_after_transformation2 <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_BOXCOX", "KIDSDRIV_BOXCOX", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)

# View the skewness values after transformation
print(skew_values_after_transformation2)
##               AGE           CAR_AGE TARGET_AMT_BOXCOX   KIDSDRIV_BOXCOX 
##       -0.02899961        0.29120232        1.07282678       -2.60368149 
##      HOMEKIDS_LOG 
##        0.93255965
# Check skewness after applying the transformations
skew_values_after_transformation3 <- sapply(insurance_training[, c("AGE", "CAR_AGE", "TARGET_AMT_SQRT", "KIDSDRIV_CUBE", "HOMEKIDS_LOG")], skewness, na.rm = TRUE)

# View the skewness values after transformation
print(skew_values_after_transformation3)
##             AGE         CAR_AGE TARGET_AMT_SQRT   KIDSDRIV_CUBE    HOMEKIDS_LOG 
##     -0.02899961      0.29120232      2.34878526      2.43528069      0.93255965

Based on the transformations above:

  • TARGET_AMT: Box-Cox was more effective in reducing skewness compared to the square root or cube transformations. While for KIDSDRIV, Box-Cox made the variable more negatively skewed, whereas cube transformation made it more positively skewed. Neither transformation worked well. So we better keep the _CUBE or find another approach for this variable.

e. Creating New Variables:

Age-based Grouping (AGE_GROUP): Age is a continuous variable, but for the purposes of analysis and modeling, grouping it into categories allows us to better understand trends in different age ranges. For example, it might be valuable to compare the behavior of individuals in their 20s versus those in their 50s when it comes to claims or risk.

# Create age groups
insurance_training$AGE_GROUP <- cut(insurance_training$AGE, 
                               breaks = c(18, 30, 50, Inf), 
                               labels = c("18-30", "31-50", "51+"))

Creating Ratio Variable (KIDSDRIV_RATIO): This gives us a relative measure of how many kids are driving in relation to the parent’s age. This might indicate a trend where younger parents might have fewer kids driving or older parents might have more kids in the driving age range. This may impact outcomes like insurance risk or claim amounts.

# Create a new variable as the ratio of KIDSDRIV to AGE
insurance_training$KIDSDRIV_RATIO <- insurance_training$KIDSDRIV / insurance_training$AGE
# Create a new variable as the ratio of HOMEKIDS to AGE
insurance_training$HOMEKIDS_RATIO <- insurance_training$HOMEKIDS / insurance_training$AGE

3. BUILD MODELS:

3.1 Multiple Linear Regression Models:

3.1.1 Model 1: Using original varaibles

We are going to use the variables; AGE, CAR_AGE, KIDSDRIV_LOG, HOMEKIDS_LOG which are likely to impact the target variable. We use log-transformed TARGET_AMT to handle skewness.

# Multiple Linear Regression - Model 1 (using selected transformed variables)
model1 <- lm(TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG, data = insurance_training)
summary(model1)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + 
##     HOMEKIDS_LOG, data = insurance_training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.225 -2.273 -1.800  4.265  9.514 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.518424   0.263075  13.374  < 2e-16 ***
## AGE          -0.025437   0.005407  -4.704 2.59e-06 ***
## CAR_AGE      -0.049623   0.007399  -6.707 2.12e-11 ***
## KIDSDRIV_LOG  0.918823   0.161351   5.695 1.28e-08 ***
## HOMEKIDS_LOG  0.326423   0.097973   3.332 0.000867 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared:  0.02669,    Adjusted R-squared:  0.02621 
## F-statistic: 55.91 on 4 and 8156 DF,  p-value: < 2.2e-16

The model above explains only about 2.67% of the variance in the target variable ( \(R^2=0.0267\)), which suggests that while the predictors are statistically significant, they do not account for much of the variability in the target variable.

The significant predictors are AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG, with the strongest positive relationship seen in KIDSDRIV_LOG and HOMEKIDS_LOG, while AGE and CAR_AGE have negative relationships with the target variable.

3.1.2. Model 2: Using Transformed Variables

In this model, we’ll use the log-transformed variables for better model stability, which should improve performance by addressing skewness in the data.

summary(insurance_training$AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   39.00   45.00   44.79   51.00   81.00
summary(insurance_training$CAR_AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -3.000   4.000   8.328   8.328  12.000  28.000
summary(insurance_training$KIDSDRIV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1711  0.0000  4.0000
summary(insurance_training$HOMEKIDS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7212  1.0000  5.0000
any(is.na(insurance_training$AGE))
## [1] FALSE
any(is.na(insurance_training$CAR_AGE))
## [1] FALSE
any(is.na(insurance_training$KIDSDRIV))
## [1] FALSE
any(is.na(insurance_training$HOMEKIDS))
## [1] FALSE
# Multiple Linear Regression - Model 2 (using log-transformed variables)
model2 <- lm(TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG, 
             data = insurance_training)
summary(model2)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + 
##     HOMEKIDS_LOG, data = insurance_training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.225 -2.273 -1.800  4.265  9.514 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.518424   0.263075  13.374  < 2e-16 ***
## AGE          -0.025437   0.005407  -4.704 2.59e-06 ***
## CAR_AGE      -0.049623   0.007399  -6.707 2.12e-11 ***
## KIDSDRIV_LOG  0.918823   0.161351   5.695 1.28e-08 ***
## HOMEKIDS_LOG  0.326423   0.097973   3.332 0.000867 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared:  0.02669,    Adjusted R-squared:  0.02621 
## F-statistic: 55.91 on 4 and 8156 DF,  p-value: < 2.2e-16

The individual predictors (AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG) are statistically significant and have the expected signs in terms of their effect on the target variable (TARGET_AMT_LOG).

However, the model fit is weak (with a low R-squared of 0.02669), indicating that these predictors alone do not explain much of the variability in the target variable. There could be other variables or interactions that are not accounted for, or the relationship between predictors and the target may not be linear.

3.1.3 Model 3: Using Interaction Terms

We introduce interaction terms between variables to explore the combined effects of variables on the target.

# Multiple Linear Regression - Model 3 (including interaction terms)
model3 <- lm(TARGET_AMT_LOG ~ AGE * CAR_AGE + KIDSDRIV_LOG * HOMEKIDS_LOG, data = insurance_training)
summary(model3)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE * CAR_AGE + KIDSDRIV_LOG * 
##     HOMEKIDS_LOG, data = insurance_training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.249 -2.248 -1.770  4.240  9.559 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                3.9281839  0.4160701   9.441  < 2e-16 ***
## AGE                       -0.0347230  0.0089855  -3.864 0.000112 ***
## CAR_AGE                   -0.1027284  0.0397057  -2.587 0.009692 ** 
## KIDSDRIV_LOG               1.4434020  0.4781505   3.019 0.002546 ** 
## HOMEKIDS_LOG               0.3354303  0.0998403   3.360 0.000784 ***
## AGE:CAR_AGE                0.0011640  0.0008567   1.359 0.174262    
## KIDSDRIV_LOG:HOMEKIDS_LOG -0.4573381  0.4018289  -1.138 0.255095    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.623 on 8154 degrees of freedom
## Multiple R-squared:  0.02707,    Adjusted R-squared:  0.02635 
## F-statistic: 37.81 on 6 and 8154 DF,  p-value: < 2.2e-16

Adding interaction terms did not substantially improve the model fit. The main effects (AGE, CAR_AGE, KIDSDRIV_LOG, and HOMEKIDS_LOG) remain statistically significant, but the interaction terms are not, suggesting that these interactions do not improve the model’s ability to predict TARGET_AMT_LOG.

Model fit is still weak with low R-squared values, meaning the model is not explaining much of the variability in the target. Further steps could include adding additional predictors or exploring non-linear relationships, or possibly using more advanced models such as random forests or boosting.

3.2. Binary Logistic Regression Models:

Binary logistic regression models predict a binary outcome (0 or 1). We’ll predict the TARGET_FLAG (whether a claim happened: 0 = no, 1 = yes).

3.2.1 Model 1: Using Transformed Variables

We’ll start by using some of the transformed variables that are more normally distributed, as they were transformed to reduce skewness and stabilize the model.

# Logistic Regression - Model 1
log_model1 <- glm(TARGET_FLAG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG, 
                  family = binomial(link = "logit"), data = insurance_training)
summary(log_model1)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + CAR_AGE + KIDSDRIV_LOG + HOMEKIDS_LOG, 
##     family = binomial(link = "logit"), data = insurance_training)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.143951   0.166943  -0.862  0.38853    
## AGE          -0.017073   0.003469  -4.922 8.58e-07 ***
## CAR_AGE      -0.032706   0.004799  -6.815 9.40e-12 ***
## KIDSDRIV_LOG  0.519439   0.093623   5.548 2.89e-08 ***
## HOMEKIDS_LOG  0.186831   0.060143   3.106  0.00189 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 9198.1  on 8156  degrees of freedom
## AIC: 9208.1
## 
## Number of Fisher Scoring iterations: 4
# Check multicollinearity using VIF
vif(log_model1)
##          AGE      CAR_AGE KIDSDRIV_LOG HOMEKIDS_LOG 
##     1.362228     1.029894     1.380320     1.771764

AGE and CAR_AGE both have a negative relationship with the probability of TARGET_FLAG = 1, meaning as age and car age increase, the likelihood of the target outcome decreases.

KIDSDRIV_LOG and HOMEKIDS_LOG both have positive relationships with the target outcome, meaning that as these variables increase, the likelihood of TARGET_FLAG = 1 increases.

The model’s fit is acceptable, but there is room for improvement, as indicated by the residual deviance and AIC.

# Apply log transformation to variables in the evaluation dataset
insurance_evaluation$KIDSDRIV_LOG <- log(insurance_evaluation$KIDSDRIV + 1)
insurance_evaluation$HOMEKIDS_LOG <- log(insurance_evaluation$HOMEKIDS + 1)

- Evaluating the model:

# Residual diagnostics
par(mfrow = c(2, 2))
plot(log_model1)

# Predict on training and evaluation datasets
insurance_training$PRED_TARGET_AMT <- exp(predict(log_model1, newdata = insurance_training)) - 1
insurance_evaluation$PRED_TARGET_AMT <- exp(predict(log_model1, newdata = insurance_evaluation)) - 1

3.2.2 Model 2: Including Interaction Terms

We include interaction terms to explore the effect of variable combinations on the target variable.

# Logistic Regression - Model 3 (including interaction terms + KIDSDRIV_RATIO)
log_model2 <- glm(TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + HOMEKIDS_LOG, 
                  family = binomial(link = "logit"), data = insurance_training)
summary(log_model2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + 
##     HOMEKIDS_LOG, family = binomial(link = "logit"), data = insurance_training)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.0820063  0.2532847  -0.324 0.746111    
## AGE            -0.0185297  0.0055483  -3.340 0.000839 ***
## CAR_AGE        -0.0475421  0.0254602  -1.867 0.061858 .  
## KIDSDRIV_RATIO 10.7397294  2.1750050   4.938  7.9e-07 ***
## HOMEKIDS_LOG    0.2149380  0.0590505   3.640 0.000273 ***
## AGE:CAR_AGE     0.0003346  0.0005585   0.599 0.549066    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 9204.3  on 8155  degrees of freedom
## AIC: 9216.3
## 
## Number of Fisher Scoring iterations: 4

The significant predictors in this model are AGE, KIDSDRIV_RATIO, and HOMEKIDS_LOG, indicating they are important in predicting the outcome (TARGET_FLAG).

The model is not greatly improved by the interaction term (AGE:CAR_AGE), suggesting that there is no strong interaction effect between AGE and CAR_AGE.

The CAR_AGE predictor is marginally significant, suggesting a potential relationship, but it is not as strong as the other variables.

3.3.2 Model 3: Including Interaction Terms+ Other

# Logistic Regression - Model 3 (Including Interaction Terms + KIDSDRIV_RATIO)
log_model3 <- glm(TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + HOMEKIDS_LOG, 
                          family = binomial(link = "logit"), data = insurance_training)
summary(log_model3)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE * CAR_AGE + KIDSDRIV_RATIO + 
##     HOMEKIDS_LOG, family = binomial(link = "logit"), data = insurance_training)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.0820063  0.2532847  -0.324 0.746111    
## AGE            -0.0185297  0.0055483  -3.340 0.000839 ***
## CAR_AGE        -0.0475421  0.0254602  -1.867 0.061858 .  
## KIDSDRIV_RATIO 10.7397294  2.1750050   4.938  7.9e-07 ***
## HOMEKIDS_LOG    0.2149380  0.0590505   3.640 0.000273 ***
## AGE:CAR_AGE     0.0003346  0.0005585   0.599 0.549066    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9418.0  on 8160  degrees of freedom
## Residual deviance: 9204.3  on 8155  degrees of freedom
## AIC: 9216.3
## 
## Number of Fisher Scoring iterations: 4

AGE and KIDSDRIV_RATIO are the strongest predictors, with KIDSDRIV_RATIO having a particularly large effect on the outcome.

CAR_AGE has a weaker, marginally significant effect, while HOMEKIDS_LOG also contributes significantly to the model.

The interaction between AGE and CAR_AGE does not significantly improve the model.

4. SELECT MODELS

In this section, we will evaluate the multiple linear regression and binary logistic regression models using various criteria. The goal is to select the models that provide the best balance between performance and interpretability, while also considering the business context and model simplicity. Here, we will explain the criteria used to select the best models, address potential issues such as multi-collinearity, and discuss the relevant model outputs.

4.1 Compare Coefficients:

The key objective for the multiple linear regression model is to find the best model that explains the variability in the target variable (TARGET_AMT_LOG).

Let’s extract Coefficients and Standard Errors:

# Model Evaluation for Multiple Linear Regression - Model 1
# Check for multicollinearity (VIF)
vif(model1)  # Variance Inflation Factor (VIF)
##          AGE      CAR_AGE KIDSDRIV_LOG HOMEKIDS_LOG 
##     1.352025     1.036944     1.359481     1.746139
# Calculate R-squared, Adjusted R-squared, RMSE, and F-statistic
summary(model1)
## 
## Call:
## lm(formula = TARGET_AMT_LOG ~ AGE + CAR_AGE + KIDSDRIV_LOG + 
##     HOMEKIDS_LOG, data = insurance_training)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.225 -2.273 -1.800  4.265  9.514 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.518424   0.263075  13.374  < 2e-16 ***
## AGE          -0.025437   0.005407  -4.704 2.59e-06 ***
## CAR_AGE      -0.049623   0.007399  -6.707 2.12e-11 ***
## KIDSDRIV_LOG  0.918823   0.161351   5.695 1.28e-08 ***
## HOMEKIDS_LOG  0.326423   0.097973   3.332 0.000867 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.623 on 8156 degrees of freedom
## Multiple R-squared:  0.02669,    Adjusted R-squared:  0.02621 
## F-statistic: 55.91 on 4 and 8156 DF,  p-value: < 2.2e-16
# Plot residuals
par(mfrow = c(2, 2))
plot(model1)

# RMSE Calculation
rmse_model1 <- sqrt(mean(model1$residuals^2))

# Display results
cat("Adjusted R^2: ", summary(model1)$adj.r.squared, "\n")
## Adjusted R^2:  0.02621179
cat("RMSE: ", rmse_model1, "\n")
## RMSE:  3.621839
cat("F-statistic: ", summary(model1)$fstatistic[1], "\n")
## F-statistic:  55.91137

The model appears to have statistically significant predictors (with very low p-values), but the overall fit is poor as indicated by the low R-squared and adjusted R-squared values. This suggests that while individual predictors like age, car age, and home kids may have a significant relationship with the target variable, the model is not explaining much of the variability in the target variable. Further model refinement or additional predictors may be necessary for a better fit.

4.2 Calculate AIC and Adjusted R²:

# Linear Models
coeff_model1 <- summary(model1)$coefficients
coeff_model2 <- summary(model2)$coefficients
coeff_model3 <- summary(model3)$coefficients

# Logistic Models
coeff_log_model1 <- summary(log_model1)$coefficients
coeff_log_model2 <- summary(log_model2)$coefficients
coeff_log_model3 <- summary(log_model3)$coefficients

# Display coefficients
print("Linear Model 1 Coefficients:")
## [1] "Linear Model 1 Coefficients:"
coeff_model1
##                 Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept)   3.51842381 0.263075364 13.374205 2.273907e-40
## AGE          -0.02543688 0.005407291 -4.704182 2.590791e-06
## CAR_AGE      -0.04962289 0.007399082 -6.706627 2.124461e-11
## KIDSDRIV_LOG  0.91882323 0.161351303  5.694551 1.279887e-08
## HOMEKIDS_LOG  0.32642278 0.097973128  3.331758 8.668289e-04
print("Linear Model 2 Coefficients:")
## [1] "Linear Model 2 Coefficients:"
coeff_model2
##                 Estimate  Std. Error   t value     Pr(>|t|)
## (Intercept)   3.51842381 0.263075364 13.374205 2.273907e-40
## AGE          -0.02543688 0.005407291 -4.704182 2.590791e-06
## CAR_AGE      -0.04962289 0.007399082 -6.706627 2.124461e-11
## KIDSDRIV_LOG  0.91882323 0.161351303  5.694551 1.279887e-08
## HOMEKIDS_LOG  0.32642278 0.097973128  3.331758 8.668289e-04
print("Linear Model 3 Coefficients:")
## [1] "Linear Model 3 Coefficients:"
coeff_model3
##                               Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)                3.928183890 0.4160700703  9.441159 4.720873e-21
## AGE                       -0.034722973 0.0089855105 -3.864329 1.122585e-04
## CAR_AGE                   -0.102728387 0.0397057240 -2.587244 9.691855e-03
## KIDSDRIV_LOG               1.443402034 0.4781504595  3.018719 2.546304e-03
## HOMEKIDS_LOG               0.335430254 0.0998403333  3.359667 7.839450e-04
## AGE:CAR_AGE                0.001164017 0.0008566794  1.358755 1.742619e-01
## KIDSDRIV_LOG:HOMEKIDS_LOG -0.457338073 0.4018288879 -1.138141 2.550949e-01
print("Logistic Model 1 Coefficients:")
## [1] "Logistic Model 1 Coefficients:"
coeff_log_model1
##                 Estimate  Std. Error   z value     Pr(>|z|)
## (Intercept)  -0.14395111 0.166942609 -0.862279 3.885340e-01
## AGE          -0.01707333 0.003468944 -4.921766 8.576694e-07
## CAR_AGE      -0.03270645 0.004798914 -6.815385 9.401145e-12
## KIDSDRIV_LOG  0.51943937 0.093623332  5.548183 2.886538e-08
## HOMEKIDS_LOG  0.18683124 0.060143125  3.106444 1.893523e-03
print("Logistic Model 2 Coefficients:")
## [1] "Logistic Model 2 Coefficients:"
coeff_log_model2
##                     Estimate   Std. Error    z value     Pr(>|z|)
## (Intercept)    -0.0820063355 0.2532846535 -0.3237714 7.461111e-01
## AGE            -0.0185296892 0.0055483452 -3.3396785 8.387542e-04
## CAR_AGE        -0.0475420986 0.0254601629 -1.8673132 6.185786e-02
## KIDSDRIV_RATIO 10.7397293775 2.1750050091  4.9377952 7.901074e-07
## HOMEKIDS_LOG    0.2149379711 0.0590505149  3.6399000 2.727439e-04
## AGE:CAR_AGE     0.0003346359 0.0005585082  0.5991602 5.490660e-01
# Linear Models
aic_model1 <- AIC(model1)
aic_model2 <- AIC(model2)  # Will be same as model1
aic_model3 <- AIC(model3)

adjusted_r2_model1 <- summary(model1)$adj.r.squared
adjusted_r2_model2 <- summary(model2)$adj.r.squared
adjusted_r2_model3 <- summary(model3)$adj.r.squared

# Logistic Models
aic_log_model1 <- AIC(log_model1)
aic_log_model2 <- AIC(log_model2)
aic_log_model3 <- AIC(log_model3)

# Display results
cat("Linear Models AIC and Adjusted R²:\n")
## Linear Models AIC and Adjusted R²:
cat("Model 1: AIC =", aic_model1, "Adjusted R² =", adjusted_r2_model1, "\n")
## Model 1: AIC = 44178.03 Adjusted R² = 0.02621179
cat("Model 2: AIC =", aic_model2, "Adjusted R² =", adjusted_r2_model2, "\n")
## Model 2: AIC = 44178.03 Adjusted R² = 0.02621179
cat("Model 3: AIC =", aic_model3, "Adjusted R² =", adjusted_r2_model3, "\n")
## Model 3: AIC = 44178.84 Adjusted R² = 0.02635403
cat("\nLogistic Models AIC:\n")
## 
## Logistic Models AIC:
cat("Model 1: AIC =", aic_log_model1, "\n")
## Model 1: AIC = 9208.064
cat("Model 2: AIC =", aic_log_model2, "\n")
## Model 2: AIC = 9216.267
cat("Model 3: AIC =", aic_log_model3, "\n")
## Model 3: AIC = 9216.267

4.3 Select Models Based on Metrics:

Linear Regression Models

  • Model 1 and Model 2:

Both models are identical, as reflected by the same coefficients, AIC, and Adjusted R² values. AIC: 44178.03 Adjusted R²: 0.0262

  • Model 3: Adds interaction terms (AGE:CAR_AGE and KIDSDRIV_LOG:HOMEKIDS_LOG). Slightly higher Adjusted R² (0.0264) compared to Models 1 and 2. Higher AIC (44178.84), suggesting Model 3 doesn’t perform better overall.

  • Decision for Linear Models:

Model 1 or Model 2 is preferred due to lower AIC, simpler structure, and comparable Adjusted R².

Logistic Regression Models

  • Model 1: AIC: 9208.06;

Significant predictors: AGE, CAR_AGE, KIDSDRIV_LOG, HOMEKIDS_LOG (p-values < 0.05).

  • Model 2: Adds AGE:CAR_AGE interaction and KIDSDRIV_RATIO. AIC: 9216.27 (higher than Model 1).

Significant predictors: AGE, KIDSDRIV_RATIO, and HOMEKIDS_LOG.

Interaction term AGE:CAR_AGE is not significant (p = 0.549), indicating no meaningful contribution.

Decision for Logistic Models:

Model 1 is preferred due to lower AIC and a more parsimonious structure.

So based on the above metrics and comparison, our final model selection is: Model1 for both linear regression and logistic regression.

Let’s generate the ROC Curves for better decision:

# Predict probabilities on the training dataset
probabilities <- predict(log_model1, newdata = insurance_training, type = "response")

# Calculate the ROC curve
roc_curve <- roc(insurance_training$TARGET_FLAG, probabilities)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve, col = "blue", lwd = 2, 
     main = "Corrected ROC Curve for Logistic Model 1",
     xlab = "False Positive Rate", ylab = "True Positive Rate", 
     xlim = c(0, 1), ylim = c(0, 1))  # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red")  # Add diagonal line

# Display the AUC
auc(roc_curve)
## Area under the curve: 0.6095
# Predict probabilities on the training dataset
probabilities2 <- predict(log_model2, newdata = insurance_training, type = "response")

# Calculate the ROC curve
roc_curve2 <- roc(insurance_training$TARGET_FLAG, probabilities2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve2, col = "blue", lwd = 2, 
     main = "Corrected ROC Curve for Logistic Model 2",
     xlab = "False Positive Rate", ylab = "True Positive Rate", 
     xlim = c(0, 1), ylim = c(0, 1))  # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red")  # Add diagonal line

# Display the AUC
auc(roc_curve2)
## Area under the curve: 0.6072
# Predict probabilities on the training dataset
probabilities3 <- predict(log_model3, newdata = insurance_training, type = "response")

# Calculate the ROC curve
roc_curve3 <- roc(insurance_training$TARGET_FLAG, probabilities3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot the ROC curve
plot(roc_curve3, col = "blue", lwd = 2, 
     main = "Corrected ROC Curve for Logistic Model 3",
     xlab = "False Positive Rate", ylab = "True Positive Rate", 
     xlim = c(0, 1), ylim = c(0, 1))  # Ensure proper axis limits
abline(a = 0, b = 1, lty = 2, col = "red")  # Add diagonal line

# Display the AUC
auc(roc_curve3)
## Area under the curve: 0.6072