Setting up the environment

# Clear the workspace
rm(list = ls()) # Clear environment
gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 530404 28.4    1180741 63.1         NA   669397 35.8
## Vcells 989609  7.6    8388608 64.0      16384  1851879 14.2
cat("\f")       # Clear the console
graphics.off()  # Clear all graphs

Libraries

invisible({
  # List of packages to load
  packages <- c("reader",       # importing data
                "psych",        # quick summary stats for data exploration
                "dplyr",
                "tidyr",
                "scales",
                "glmnet",
                "caret",
                "ggcorrplot",
                "corrplot",
                "moments",
                "mice",         # for imputation of missing values and vis of missing data
                "stargazer",    # summary stats
                "vtable",       # summary stats
                "summarytools", # summary stats
                "naniar",       # for visualisation of missing data
                "visdat",       # for visualisation of missing data
                "VIM",          # for visualisation of missing data
                "DataExplorer", # for visualisation of missing data
                "tidyverse",    # data manipulation like selecting variables
                "fastDummies",  # Create dummy variables using fastDummies
                "corrplot",     # correlation plots
                "ggplot2",      # graphing
                "reshape",
                "data.table",   # reshape for graphing
                "car")          # vif for multicollinearity
  
  # Load all packages
  lapply(packages, library, character.only = TRUE)
})
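
If some of these packages are not yet installed, a small guard run before the lapply() call takes care of it. This is a convenience sketch, assuming the packages vector defined above; missing_pkgs is an illustrative name.

missing_pkgs <- setdiff(packages, rownames(installed.packages()))  # packages not yet installed
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)       # install only what is missing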

Importing Datasets

stargazer(df_train, type = "text", title = "Summary of df_train", summary = TRUE)
## 
## Summary of df
## =======================================================
## Statistic     N     Mean    St. Dev.   Min      Max    
## -------------------------------------------------------
## INDEX       6,528 5,156.877 2,986.613   1     10,302   
## TARGET_FLAG 6,528   0.264     0.441     0        1     
## TARGET_AMT  6,528 1,466.616 4,545.654 0.000 107,586.100
## KIDSDRIV    6,528   0.171     0.509     0        4     
## AGE         6,525  44.853     8.651    16       81     
## HOMEKIDS    6,528   0.719     1.106     0        5     
## YOJ         6,153  10.510     4.104     0       23     
## TRAVTIME    6,528  33.442    15.966     5       142    
## TIF         6,528   5.367     4.150     1       25     
## CLM_FREQ    6,528   0.799     1.159     0        5     
## MVR_PTS     6,528   1.700     2.145     0       13     
## CAR_AGE     6,129   8.315     5.725    -3       27     
## -------------------------------------------------------

Data pre-processing

# Drop the "INDEX" column from df_train
df_train <- df_train[, !(names(df_train) %in% "INDEX")]
df_test <- df_test[, !(names(df_test) %in% "INDEX")]
# Replace all blank values with NA for easy identification
df_train[df_train == ""] <- NA
df_test[df_test == ""] <- NA
vis_dat(df_train) +
  theme(
    axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

Imputing the missing values

# Filter rows where the Job column is NA and select the education column
missing_Jobs_education <- df_train %>%
  filter(is.na(JOB)) %>%
  select(EDUCATION) 

# Display the education levels corresponding to missing Jobs
unique_education_levels <- unique(missing_Jobs_education$EDUCATION)
print(unique_education_levels)
## [1] "PhD"     "Masters"
# Analyze Jobs held by PhD holders
phd_Jobs <- df_train %>%
  filter(EDUCATION == "PhD") %>%
  group_by(JOB) %>%  # Group by Job titles
  summarise(count = n()) %>%  # Count occurrences of each Job
  arrange(desc(count))  # Arrange in descending order

# Display the most common Jobs among PhD holders
print(phd_Jobs)
## # A tibble: 6 × 2
##   JOB          count
##   <chr>        <int>
## 1 Doctor         190
## 2 <NA>           160
## 3 Manager        108
## 4 Lawyer          62
## 5 Home Maker      52
## 6 Professional    11
# Analyze Jobs held by Masters graduates
masters_Jobs <- df_train %>%
  filter(EDUCATION == "Masters") %>%
  group_by(JOB) %>%  # Group by Job titles
  summarise(count = n()) %>%  # Count occurrences of each Job
  arrange(desc(count))  # Arrange in descending order

# Display the most common Jobs among Masters graduates
print(masters_Jobs)
## # A tibble: 6 × 2
##   JOB           count
##   <chr>         <int>
## 1 Lawyer          626
## 2 <NA>            259
## 3 Manager         225
## 4 Professional    151
## 5 Home Maker       69
## 6 z_Blue Collar     1

Among records with a known job, most PhD holders are Doctors and most Masters graduates are Lawyers, so the missing jobs are imputed accordingly.

df_train <- df_train %>%
  mutate(JOB = case_when(
    is.na(JOB) & EDUCATION == "PhD" ~ "Doctor",  # Replace NA with "Doctor" for PhD
    is.na(JOB) ~ "Lawyer",  # Replace remaining NA (Masters) with "Lawyer"
    TRUE ~ JOB  # Keep existing values if not NA
  ))

df_test <- df_test %>%
  mutate(JOB = case_when(
    is.na(JOB) & EDUCATION == "PhD" ~ "Doctor",  # Replace NA with "Doctor" for PhD
    is.na(JOB) ~ "Lawyer",  # Replace remaining NA (Masters) with "Lawyer"
    TRUE ~ JOB  # Keep existing values if not NA
  ))
summary(df_train[, c("INCOME", "HOME_VAL", "AGE", "YOJ", "CAR_AGE")])
##     INCOME            HOME_VAL              AGE             YOJ       
##  Length:6528        Length:6528        Min.   :16.00   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:39.00   1st Qu.: 9.00  
##  Mode  :character   Mode  :character   Median :45.00   Median :11.00  
##                                        Mean   :44.85   Mean   :10.51  
##                                        3rd Qu.:51.00   3rd Qu.:13.00  
##                                        Max.   :81.00   Max.   :23.00  
##                                        NA's   :3       NA's   :375    
##     CAR_AGE      
##  Min.   :-3.000  
##  1st Qu.: 1.000  
##  Median : 8.000  
##  Mean   : 8.315  
##  3rd Qu.:12.000  
##  Max.   :27.000  
##  NA's   :399

Imputation and Re-coding

# Convert INCOME and HOME_VAL to numeric, removing any non-numeric characters
df_train <- df_train %>%
  mutate(
    INCOME = as.integer(gsub("[^0-9.]", "", INCOME)),
    HOME_VAL = as.integer(gsub("[^0-9.]", "", HOME_VAL))
  )

# Perform the imputation
imputed_data <- mice(df_train, m = 5, method = 'pmm', maxit = 10, seed = 123)

# Complete the dataset with imputed values
df_train <- complete(imputed_data)
# Convert INCOME and HOME_VAL to numeric, removing any non-numeric characters
df_test <- df_test %>%
  mutate(
    INCOME = as.integer(gsub("[^0-9.]", "", INCOME)),
    HOME_VAL = as.integer(gsub("[^0-9.]", "", HOME_VAL))
  )

# Perform the imputation
imputed_data <- mice(df_test, m = 5, method = 'pmm', maxit = 10, seed = 123)

# Complete the dataset with imputed values
df_test <- complete(imputed_data)

Now the datasets no longer have any missing values.
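
A quick per-column count of NAs confirms this (a minimal sanity check):

colSums(is.na(df_train))  # should be all zeros after imputation
colSums(is.na(df_test))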

df_train$SEX        <- dplyr::recode(df_train$SEX,
                                  "M"   = "Male",
                                  "z_F"  = "Female"
                                  )

df_train$CAR_TYPE   <- dplyr::recode(df_train$CAR_TYPE,
                                  "z_SUV"   = "SUV"
                                 )

df_train$EDUCATION  <- dplyr::recode(df_train$EDUCATION,
                                  "z_High School"   = "High School",
                                  "<High School"   = "Before HS"
                                 ) 

df_train$MSTATUS    <- dplyr::recode(df_train$MSTATUS,
                                  "z_No" = "No"
                                  )

df_train$RED_CAR    <- dplyr::recode(df_train$RED_CAR,
                                  "yes"  = "Yes",
                                  "no"   = "No"
                                  )

df_train$URBANICITY <- dplyr::recode(df_train$URBANICITY,
                                  "Highly Urban/ Urban"   = "Urban",
                                  "z_Highly Rural/ Rural" = "Rural"
                                  )

df_train$JOB        <- dplyr::recode(df_train$JOB, 
                               "z_Blue Collar" = "Blue Collar")
# Renaming columns for readability
df_train <- df_train |>
  dplyr::rename(Crash_Flag         = TARGET_FLAG,
                Crash_Cost         = TARGET_AMT,
                Teen_Driver        = KIDSDRIV,
                Child_count        = HOMEKIDS,
                Age                = AGE,
                Income             = INCOME,
                Job_Tenure         = YOJ,
                House_Worth        = HOME_VAL,
                Marital_Status     = MSTATUS,
                Sex                = SEX,
                Job                = JOB,
                Car_Purpose        = CAR_USE,
                Bluebook           = BLUEBOOK,
                Time_in_Force      = TIF,
                Car_Type           = CAR_TYPE,
                Red_Car            = RED_CAR,
                Prev_Claims        = OLDCLAIM,
                Education          = EDUCATION,
                Single_Parent      = PARENT1, 
                Revoked            = REVOKED,
                Record_Points      = MVR_PTS,
                Car_Age            = CAR_AGE,
                Urbanicity        = URBANICITY,
                Claim_freq         = CLM_FREQ,
                Travel_Time        = TRAVTIME
         )
# Display the structure of df_train to check data types
str(df_train)
## 'data.frame':    6528 obs. of  25 variables:
##  $ Crash_Flag    : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Crash_Cost    : num  0 0 0 0 0 ...
##  $ Teen_Driver   : int  0 0 0 0 0 2 0 0 0 0 ...
##  $ Age           : int  40 48 51 56 51 45 65 56 53 49 ...
##  $ Child_count   : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ Job_Tenure    : int  9 0 14 10 9 11 15 13 10 12 ...
##  $ Income        : int  78658 0 89606 199336 161325 102264 68304 44320 74826 61950 ...
##  $ Single_Parent : chr  "No" "No" "No" "No" ...
##  $ House_Worth   : int  236005 75688 288266 501076 429578 0 253131 166535 237354 0 ...
##  $ Marital_Status: chr  "No" "Yes" "No" "Yes" ...
##  $ Sex           : chr  "Female" "Female" "Female" "Female" ...
##  $ Education     : chr  "Masters" "Masters" "Bachelors" "PhD" ...
##  $ Job           : chr  "Lawyer" "Home Maker" "Clerical" "Lawyer" ...
##  $ Travel_Time   : int  69 39 32 23 46 14 39 48 51 26 ...
##  $ Car_Purpose   : chr  "Private" "Private" "Private" "Commercial" ...
##  $ Bluebook      : chr  "$17,040" "$1,500" "$10,460" "$41,280" ...
##  $ Time_in_Force : int  4 6 1 1 6 3 4 1 10 1 ...
##  $ Car_Type      : chr  "Pickup" "Sports Car" "SUV" "Van" ...
##  $ Red_Car       : chr  "No" "No" "No" "No" ...
##  $ Prev_Claims   : chr  "$0" "$3,867" "$21,581" "$0" ...
##  $ Claim_freq    : int  0 3 2 0 0 2 0 1 0 0 ...
##  $ Revoked       : chr  "No" "No" "Yes" "No" ...
##  $ Record_Points : int  1 5 0 1 0 3 1 7 0 0 ...
##  $ Car_Age       : int  15 1 8 16 14 7 11 15 9 10 ...
##  $ Urbanicity    : chr  "Rural" "Urban" "Urban" "Urban" ...
df_train$Single_Parent <- as.factor(df_train$Single_Parent)
df_train$Marital_Status <- as.factor(df_train$Marital_Status)
df_train$Sex <- as.factor(df_train$Sex)
df_train$Education <- as.factor(df_train$Education)
df_train$Job <- as.factor(df_train$Job)
df_train$Car_Purpose <- as.factor(df_train$Car_Purpose)
df_train$Revoked <- as.factor(df_train$Revoked)
df_train$Urbanicity <- as.factor(df_train$Urbanicity)
df_train$Car_Type <- as.factor(df_train$Car_Type)
df_train$Red_Car <- as.factor(df_train$Red_Car)
# Remove non-numeric characters and convert to integer
df_train$Bluebook <- as.integer(gsub("[^0-9]", "", df_train$Bluebook))
df_train$Prev_Claims <- as.integer(gsub("[^0-9]", "", df_train$Prev_Claims))
vis_dat(df_train) +
  theme(
    axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

# Converting Red_Car, Revoked, Marital_Status and Single_Parent to 0/1
df_train$Red_Car <- ifelse(df_train$Red_Car == "Yes", 1, 0)
df_train$Revoked <- ifelse(df_train$Revoked == "Yes", 1, 0)
df_train$Marital_Status <- ifelse(df_train$Marital_Status == "Yes", 1, 0)
df_train$Single_Parent <- ifelse(df_train$Single_Parent == "Yes", 1, 0)

Summary

stargazer(df_train, type = "text", title = "Summary of df_train", summary = TRUE)
## 
## Summary of df
## ==============================================================
## Statistic        N      Mean      St. Dev.    Min      Max    
## --------------------------------------------------------------
## Crash_Flag     6,528    0.264       0.441      0        1     
## Crash_Cost     6,528  1,466.616   4,545.654  0.000 107,586.100
## Teen_Driver    6,528    0.171       0.509      0        4     
## Age            6,528   44.847       8.654     16       81     
## Child_count    6,528    0.719       1.106      0        5     
## Job_Tenure     6,528   10.502       4.116      0       23     
## Income         6,528 61,652.100  47,643.340    0     367,030  
## Single_Parent  6,528    0.131       0.338      0        1     
## House_Worth    6,528 154,219.100 129,179.100   0     885,282  
## Marital_Status 6,528    0.600       0.490      0        1     
## Travel_Time    6,528   33.442      15.966      5       142    
## Bluebook       6,528 15,641.760   8,381.489  1,500   69,740   
## Time_in_Force  6,528    5.367       4.150      1       25     
## Red_Car        6,528    0.292       0.455      0        1     
## Prev_Claims    6,528  4,119.320   8,924.665    0     57,037   
## Claim_freq     6,528    0.799       1.159      0        5     
## Revoked        6,528    0.127       0.333      0        1     
## Record_Points  6,528    1.700       2.145      0       13     
## Car_Age        6,528    8.298       5.742     -3       27     
## --------------------------------------------------------------

Dummification

# Check the levels of EDUCATION in the training data
levels(df_train$Education)
## [1] "Bachelors"   "Before HS"   "High School" "Masters"     "PhD"
df_train$Education <- factor(df_train$Education, levels = c("Before HS", "High School", "Bachelors", "Masters", "PhD"))
train_f<- dummy_cols(df_train, 
                               select_columns = c("Education", "Job", "Urbanicity", "Car_Type", "Sex", "Car_Purpose"),
                               remove_first_dummy = TRUE,  
                               remove_selected_columns = TRUE) 
colnames(train_f) <- gsub(" ", "_", colnames(train_f))
stargazer(train_f, type = "text", title = "Summary of df_train", summary = TRUE)
## 
## Summary of df
## =====================================================================
## Statistic               N      Mean      St. Dev.    Min      Max    
## ---------------------------------------------------------------------
## Crash_Flag            6,528    0.264       0.441      0        1     
## Crash_Cost            6,528  1,466.616   4,545.654  0.000 107,586.100
## Teen_Driver           6,528    0.171       0.509      0        4     
## Age                   6,528   44.847       8.654     16       81     
## Child_count           6,528    0.719       1.106      0        5     
## Job_Tenure            6,528   10.502       4.116      0       23     
## Income                6,528 61,652.100  47,643.340    0     367,030  
## Single_Parent         6,528    0.131       0.338      0        1     
## House_Worth           6,528 154,219.100 129,179.100   0     885,282  
## Marital_Status        6,528    0.600       0.490      0        1     
## Travel_Time           6,528   33.442      15.966      5       142    
## Bluebook              6,528 15,641.760   8,381.489  1,500   69,740   
## Time_in_Force         6,528    5.367       4.150      1       25     
## Red_Car               6,528    0.292       0.455      0        1     
## Prev_Claims           6,528  4,119.320   8,924.665    0     57,037   
## Claim_freq            6,528    0.799       1.159      0        5     
## Revoked               6,528    0.127       0.333      0        1     
## Record_Points         6,528    1.700       2.145      0       13     
## Car_Age               6,528    8.298       5.742     -3       27     
## Education_High_School 6,528    0.285       0.451      0        1     
## Education_Bachelors   6,528    0.273       0.445      0        1     
## Education_Masters     6,528    0.204       0.403      0        1     
## Education_PhD         6,528    0.089       0.285      0        1     
## Job_Clerical          6,528    0.161       0.368      0        1     
## Job_Doctor            6,528    0.069       0.253      0        1     
## Job_Home_Maker        6,528    0.078       0.269      0        1     
## Job_Lawyer            6,528    0.130       0.336      0        1     
## Job_Manager           6,528    0.116       0.320      0        1     
## Job_Professional      6,528    0.136       0.343      0        1     
## Job_Student           6,528    0.087       0.282      0        1     
## Urbanicity_Urban      6,528    0.794       0.405      0        1     
## Car_Type_Panel_Truck  6,528    0.081       0.272      0        1     
## Car_Type_Pickup       6,528    0.169       0.375      0        1     
## Car_Type_Sports_Car   6,528    0.110       0.312      0        1     
## Car_Type_SUV          6,528    0.281       0.449      0        1     
## Car_Type_Van          6,528    0.092       0.288      0        1     
## Sex_Male              6,528    0.465       0.499      0        1     
## Car_Purpose_Private   6,528    0.631       0.483      0        1     
## ---------------------------------------------------------------------

Skewness

summarytools::descr(train_f)
## Descriptive Statistics  
## train_f  
## N: 6528  
## 
##                         Age   Bluebook   Car_Age   Car_Purpose_Private   Car_Type_Panel_Truck
## ----------------- --------- ---------- --------- --------------------- ----------------------
##              Mean     44.85   15641.76      8.30                  0.63                   0.08
##           Std.Dev      8.65    8381.49      5.74                  0.48                   0.27
##               Min     16.00    1500.00     -3.00                  0.00                   0.00
##                Q1     39.00    9275.00      1.00                  0.00                   0.00
##            Median     45.00   14370.00      8.00                  1.00                   0.00
##                Q3     51.00   20755.00     12.00                  1.00                   0.00
##               Max     81.00   69740.00     27.00                  1.00                   1.00
##               MAD      8.90    8369.28      7.41                  0.00                   0.00
##               IQR     12.00   11475.00     11.00                  1.00                   0.00
##                CV      0.19       0.54      0.69                  0.76                   3.38
##          Skewness     -0.04       0.82      0.29                 -0.54                   3.08
##       SE.Skewness      0.03       0.03      0.03                  0.03                   0.03
##          Kurtosis     -0.05       0.92     -0.75                 -1.70                   7.50
##           N.Valid   6528.00    6528.00   6528.00               6528.00                6528.00
##         Pct.Valid    100.00     100.00    100.00                100.00                 100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Car_Type_Pickup   Car_Type_Sports_Car   Car_Type_SUV   Car_Type_Van
## ----------------- ----------------- --------------------- -------------- --------------
##              Mean              0.17                  0.11           0.28           0.09
##           Std.Dev              0.38                  0.31           0.45           0.29
##               Min              0.00                  0.00           0.00           0.00
##                Q1              0.00                  0.00           0.00           0.00
##            Median              0.00                  0.00           0.00           0.00
##                Q3              0.00                  0.00           1.00           0.00
##               Max              1.00                  1.00           1.00           1.00
##               MAD              0.00                  0.00           0.00           0.00
##               IQR              0.00                  0.00           1.00           0.00
##                CV              2.21                  2.85           1.60           3.15
##          Skewness              1.76                  2.50           0.98           2.83
##       SE.Skewness              0.03                  0.03           0.03           0.03
##          Kurtosis              1.11                  4.25          -1.05           6.01
##           N.Valid           6528.00               6528.00        6528.00        6528.00
##         Pct.Valid            100.00                100.00         100.00         100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Child_count   Claim_freq   Crash_Cost   Crash_Flag   Education_Bachelors
## ----------------- ------------- ------------ ------------ ------------ ---------------------
##              Mean          0.72         0.80      1466.62         0.26                  0.27
##           Std.Dev          1.11         1.16      4545.65         0.44                  0.45
##               Min          0.00         0.00         0.00         0.00                  0.00
##                Q1          0.00         0.00         0.00         0.00                  0.00
##            Median          0.00         0.00         0.00         0.00                  0.00
##                Q3          1.00         2.00      1067.00         1.00                  1.00
##               Max          5.00         5.00    107586.14         1.00                  1.00
##               MAD          0.00         0.00         0.00         0.00                  0.00
##               IQR          1.00         2.00      1066.00         1.00                  1.00
##                CV          1.54         1.45         3.10         1.67                  1.63
##          Skewness          1.32         1.21         9.12         1.07                  1.02
##       SE.Skewness          0.03         0.03         0.03         0.03                  0.03
##          Kurtosis          0.56         0.31       128.02        -0.85                 -0.96
##           N.Valid       6528.00      6528.00      6528.00      6528.00               6528.00
##         Pct.Valid        100.00       100.00       100.00       100.00                100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Education_High_School   Education_Masters   Education_PhD   House_Worth
## ----------------- ----------------------- ------------------- --------------- -------------
##              Mean                    0.28                0.20            0.09     154219.14
##           Std.Dev                    0.45                0.40            0.29     129179.07
##               Min                    0.00                0.00            0.00          0.00
##                Q1                    0.00                0.00            0.00          0.00
##            Median                    0.00                0.00            0.00     160482.00
##                Q3                    1.00                0.00            0.00     237822.50
##               Max                    1.00                1.00            1.00     885282.00
##               MAD                    0.00                0.00            0.00     148269.64
##               IQR                    1.00                0.00            0.00     237819.25
##                CV                    1.58                1.98            3.19          0.84
##          Skewness                    0.95                1.47            2.88          0.51
##       SE.Skewness                    0.03                0.03            0.03          0.03
##          Kurtosis                   -1.09                0.16            6.29          0.05
##           N.Valid                 6528.00             6528.00         6528.00       6528.00
##         Pct.Valid                  100.00              100.00          100.00        100.00
## 
## Table: Table continues below
## 
##  
## 
##                        Income   Job_Clerical   Job_Doctor   Job_Home_Maker   Job_Lawyer
## ----------------- ----------- -------------- ------------ ---------------- ------------
##              Mean    61652.10           0.16         0.07             0.08         0.13
##           Std.Dev    47643.34           0.37         0.25             0.27         0.34
##               Min        0.00           0.00         0.00             0.00         0.00
##                Q1    27661.00           0.00         0.00             0.00         0.00
##            Median    53276.00           0.00         0.00             0.00         0.00
##                Q3    85637.00           0.00         0.00             0.00         0.00
##               Max   367030.00           1.00         1.00             1.00         1.00
##               MAD    41807.84           0.00         0.00             0.00         0.00
##               IQR    57960.00           0.00         0.00             0.00         0.00
##                CV        0.77           2.28         3.68             3.43         2.59
##          Skewness        1.22           1.84         3.41             3.14         2.20
##       SE.Skewness        0.03           0.03         0.03             0.03         0.03
##          Kurtosis        2.28           1.38         9.61             7.83         2.85
##           N.Valid     6528.00        6528.00      6528.00          6528.00      6528.00
##         Pct.Valid      100.00         100.00       100.00           100.00       100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Job_Manager   Job_Professional   Job_Student   Job_Tenure   Marital_Status
## ----------------- ------------- ------------------ ------------- ------------ ----------------
##              Mean          0.12               0.14          0.09        10.50             0.60
##           Std.Dev          0.32               0.34          0.28         4.12             0.49
##               Min          0.00               0.00          0.00         0.00             0.00
##                Q1          0.00               0.00          0.00         9.00             0.00
##            Median          0.00               0.00          0.00        11.00             1.00
##                Q3          0.00               0.00          0.00        13.00             1.00
##               Max          1.00               1.00          1.00        23.00             1.00
##               MAD          0.00               0.00          0.00         2.97             0.00
##               IQR          0.00               0.00          0.00         4.00             1.00
##                CV          2.76               2.52          3.24         0.39             0.82
##          Skewness          2.40               2.12          2.93        -1.19            -0.41
##       SE.Skewness          0.03               0.03          0.03         0.03             0.03
##          Kurtosis          3.74               2.52          6.59         1.13            -1.83
##           N.Valid       6528.00            6528.00       6528.00      6528.00          6528.00
##         Pct.Valid        100.00             100.00        100.00       100.00           100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Prev_Claims   Record_Points   Red_Car   Revoked   Sex_Male   Single_Parent
## ----------------- ------------- --------------- --------- --------- ---------- ---------------
##              Mean       4119.32            1.70      0.29      0.13       0.46            0.13
##           Std.Dev       8924.67            2.14      0.45      0.33       0.50            0.34
##               Min          0.00            0.00      0.00      0.00       0.00            0.00
##                Q1          0.00            0.00      0.00      0.00       0.00            0.00
##            Median          0.00            1.00      0.00      0.00       0.00            0.00
##                Q3       4663.50            3.00      1.00      0.00       1.00            0.00
##               Max      57037.00           13.00      1.00      1.00       1.00            1.00
##               MAD          0.00            1.48      0.00      0.00       0.00            0.00
##               IQR       4663.25            3.00      1.00      0.00       1.00            0.00
##                CV          2.17            1.26      1.56      2.62       1.07            2.57
##          Skewness          3.07            1.33      0.91      2.24       0.14            2.19
##       SE.Skewness          0.03            0.03      0.03      0.03       0.03            0.03
##          Kurtosis          9.40            1.30     -1.17      3.01      -1.98            2.78
##           N.Valid       6528.00         6528.00   6528.00   6528.00    6528.00         6528.00
##         Pct.Valid        100.00          100.00    100.00    100.00     100.00          100.00
## 
## Table: Table continues below
## 
##  
## 
##                     Teen_Driver   Time_in_Force   Travel_Time   Urbanicity_Urban
## ----------------- ------------- --------------- ------------- ------------------
##              Mean          0.17            5.37         33.44               0.79
##           Std.Dev          0.51            4.15         15.97               0.40
##               Min          0.00            1.00          5.00               0.00
##                Q1          0.00            1.00         22.00               1.00
##            Median          0.00            4.00         33.00               1.00
##                Q3          0.00            7.00         44.00               1.00
##               Max          4.00           25.00        142.00               1.00
##               MAD          0.00            4.45         16.31               0.00
##               IQR          0.00            6.00         22.00               0.00
##                CV          2.98            0.77          0.48               0.51
##          Skewness          3.30            0.88          0.46              -1.45
##       SE.Skewness          0.03            0.03          0.03               0.03
##          Kurtosis         11.23            0.42          0.74               0.11
##           N.Valid       6528.00         6528.00       6528.00            6528.00
##         Pct.Valid        100.00          100.00        100.00             100.00

Key variable insights

From the summary statistics above:

  1. Central Tendency:

    • The Age column has a mean of 44.85, indicating that the average age is around mid-40s.

    • Income has a mean of 61,652.10, showing the average income for individuals in the dataset.

    • Claim_freq has a mean of 0.80, implying that most individuals file insurance claims infrequently.

  2. Dispersion (Spread):

    • The standard deviations (e.g., 8.65 for Age and 47,643.34 for Income) highlight how widely the data points spread around the mean. For Income, the spread is quite large, indicating substantial variation in income levels.

    • Car_Age has a relatively smaller standard deviation (5.74), indicating that car ages are more clustered around the mean.

  3. Skewness & Kurtosis:

    • Columns like Claim_freq, Crash_Cost, and Teen_Driver exhibit high positive skewness (1.21, 9.12, and 3.30, respectively), indicating that the data are not symmetrically distributed. For example, Claim_freq's skewness of 1.21 suggests that most individuals have low claim frequencies, with a few having higher frequencies.

    • Columns like Record_Points and Prev_Claims show skewness values greater than 1, indicating long right tails. Their kurtosis values reflect sharper peaks and more extreme observations than a normal distribution.

  4. Outliers and Extremes:

    • Claim_freq tops out at 5, while Crash_Cost contains extreme outliers (maximum value of 107,586.14).

    • Income has a high kurtosis (2.28), reflecting some outliers or a heavy-tailed distribution, particularly with a large range from 0 to 367,030.

  5. Categorical Data Insights:

    • Columns like Car_Type_Panel_Truck, Education_PhD, and Job_Clerical show relatively low means, implying fewer individuals in these categories.

    • Sex_Male shows a 46% male representation, which is close to a balanced gender distribution in the dataset.

A small number of high-end cars and homes pull variables such as Bluebook and House_Worth to the right, skewing the data significantly.
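
Since the moments package is already loaded, this right-tail skew can be quantified directly (a quick check; column names follow the renaming above):

moments::skewness(train_f$Bluebook)     # sample skewness of car values
moments::skewness(train_f$House_Worth)  # sample skewness of home values
moments::skewness(train_f$Income)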

Visualization

# Shift data to long format
data_long <- melt(df_train)

# Histogram plots of each variable
ggplot(data_long) + 
  geom_histogram(aes(x = value, y = after_stat(density)), bins = 30, fill = 'lightgray', color = 'black') + 
  geom_density(aes(x = value), color = 'blue') + 
  facet_wrap(. ~ variable, scales = 'free', ncol = 5) + 
  theme_minimal() + 
  theme(
    strip.text = element_text(size = 6),  # Adjust size of facet titles
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1), 
    axis.text.y = element_text(size = 5, hjust = 1),
    plot.margin = margin(5, 5, 10, 5)  # Increase margins to prevent cutting off text
  ) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Histogram and Density Plots for Each Variable in df_train", 
       x = "Value", 
       y = "Density")

Proportion of Car Crashes by Gender

ggplot(df_train, aes(x = Sex, fill = factor(Crash_Flag))) +
  geom_bar(position = "fill") + 
  labs(title = "Proportion of Car Crashes by Gender",
       x = "Gender",
       y = "Proportion",
       fill = "Car Crash (1 = Yes, 0 = No)") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()

Proportion of Car Crashes by Marital Status

ggplot(df_train, aes(x = factor(Marital_Status), fill = factor(Crash_Flag))) + 
  geom_bar(position = "fill") + 
  labs(title = "Proportion of Car Crashes by Marital Status",
       x = "Marital Status",
       y = "Proportion",
       fill = "Car Crash (1 = Yes, 0 = No)") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_discrete(labels = c("0" = "Not married", "1" = "Married")) +  # Label 0 as Not married and 1 as Married
  theme_minimal()

Proportion of Car Crashes by Job Type

# Gather Job columns into long format
Jobs_long <- train_f %>%
  pivot_longer(cols = starts_with("Job_") & !matches("Job_Tenure"),  # Identify Job columns
               names_to = "Job_Type",      # New column for Job type
               values_to = "Is_Job") %>%   # Column for Job presence (1 if present)
  filter(Is_Job == 1)

ggplot(Jobs_long, aes(x = factor(Job_Type), fill = factor(Crash_Flag))) + 
  geom_bar(position = "fill") + 
  labs(title = "Proportion of Car Crashes by Job Type",
       x = "Job Type",
       y = "Proportion of Car Crashes",
       fill = "Car Crash (1 = Yes, 0 = No)") + 
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) +  # Specify the breaks 
  scale_x_discrete(labels = c(  # Custom x-axis labels, excluding "Job_Tenure"
    "Job_Clerical" = "Clerical", 
    "Job_Doctor" = "Doctor", 
    "Job_Home_Maker" = "Home Maker", 
    "Job_Lawyer" = "Lawyer", 
    "Job_Manager" = "Manager", 
    "Job_Professional" = "Professional", 
    "Job_Student" = "Student"
  )) + 
  theme_minimal() + 
  coord_flip()

Claim Amount by Car Type

ggplot(df_train, aes(x = factor(Car_Type), y = Crash_Cost)) +
  geom_boxplot() +
  labs(title = "Claim Amount by Car Type",
       x = "Car Type",
       y = "Claim Amount ($)") +
  theme_minimal() +
  scale_y_continuous(labels = scales::dollar_format())

Proportion of Car Crashes by Urbanicity

ggplot(train_f, aes(x = factor(Urbanicity_Urban), fill = factor(Crash_Flag))) + 
  geom_bar(position = "fill") + 
  labs(title = "Proportion of Car Crashes by Urbanicity",
       x = "Urbanicity",
       y = "Proportion",
       fill = "Car Crash (1 = Yes, 0 = No)") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_discrete(labels = c("0" = "Rural", "1" = "Urban")) +  # Change 0 to Rural and 1 to Urban
  theme_minimal()

# Bucket the AGE variable
train_f$AGE_GROUP <- cut(train_f$Age, breaks = c(0, 25, 45, 65, Inf), labels = c("Young", "Adult", "Middle-Aged", "Senior"))

# Bucket the Income variable
train_f$Income_GROUP <- cut(train_f$Income, breaks = c(-Inf, 30000, 60000, 100000, Inf), labels = c("Low", "Medium", "High", "Very High"))
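
A quick tabulation verifies that the cut points cover every observation (a minimal check; useNA flags any values falling outside the breaks):

table(train_f$AGE_GROUP, useNA = "ifany")
table(train_f$Income_GROUP, useNA = "ifany")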

Proportion of car crashes by Age

# Plotting the proportion of car crashes by age bucket
ggplot(train_f, aes(x = AGE_GROUP, fill = factor(Crash_Flag))) +
  geom_bar(position = "fill") +  # Stacked bar plot showing proportions
  labs(title = "Proportion of Car Crashes by Age Bucket",
       x = "Age Group",
       y = "Proportion of Car Crashes",
       fill = "Car Crash (1 = Yes, 0 = No)") +
  scale_y_continuous(labels = scales::percent) +  # Use percentage labels
  theme_minimal()

Proportion of car crashes by Income

# Plotting the proportion of car crashes by Income bucket
ggplot(train_f, aes(x = Income_GROUP, fill = factor(Crash_Flag))) +
  geom_bar(position = "fill") +  # Stacked bar plot showing proportions
  labs(title = "Proportion of Car Crashes by Income Bucket",
       x = "Income Group",
       y = "Proportion of Car Crashes",
       fill = "Car Crash (1 = Yes, 0 = No)") +
  scale_y_continuous(labels = scales::percent) +  # Use percentage labels
  theme_minimal()

Correlation heat map

continuous_vars <- train_f[, c("Income", "Crash_Cost", "Bluebook", "Age", "House_Worth", "Prev_Claims")]
corr_matrix <- cor(continuous_vars, use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45)

Proportion of Crashes

# Count occurrences of each Crash_Flag value
Crash_Flag_counts <- table(train_f$Crash_Flag)

# Convert the table to a data frame and create the pie chart
ggplot(as.data.frame(Crash_Flag_counts), aes(x = "", y = Freq, fill = factor(Var1))) + 
  geom_bar(stat = "identity", width = 1) + 
  coord_polar("y") + 
  labs(title = "Proportion of Crashes (Crash Flag)", fill = "Crash Status") +   
  scale_fill_manual(values = c("skyblue", "lightcoral"), 
                    labels = c("No Crash", "Crash Occurred")) +  # Change legend labels
  theme_void()

The dataset is heavily imbalanced, with far fewer crash instances recorded than non-crash instances.
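
The imbalance is easy to quantify with a one-line check:

prop.table(table(train_f$Crash_Flag))  # share of non-crash (0) vs crash (1) records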

# Dropping columns by name
train_f <- train_f[, !names(train_f) %in% c("Income_GROUP", "AGE_GROUP")]

Correlation matrix

# Select only numeric columns excluding 'INDEX'
numeric_train <- df_train[sapply(df_train, is.numeric)]

# Remove the 'INDEX' column if it exists
numeric_train <- numeric_train[, !colnames(numeric_train) %in% "INDEX"]

# Calculate the correlation matrix
cor_matrix <- cor(numeric_train, use = "complete.obs")

# Calculate the p-values for significance testing
p.mat <- ggcorrplot::cor_pmat(numeric_train)

# Create the correlation plot
ggcorrplot(corr = cor_matrix, 
           method = "square",                 # Shape of the plot
           type = "lower",                    # Display the upper triangle
           title = "Correlation Plot",        # Title of the plot
           colors = c("red", "white", "green"), # Color scheme
           lab = TRUE,                        # Display correlation coefficients
           lab_size = 2,                      # Size of the labels
           p.mat = p.mat,                     # Add p-values for significance
           insig = "pch",                     # Mark insignificant correlations with a symbol
           pch = 4,                           # Use a cross as the symbol
           hc.order = TRUE,                   # Hierarchically cluster the variables
           tl.cex = 10,                       # Adjust size of text labels
           tl.col = "black",                  # Color of text labels
           digits = 2)                        # Number of digits to display

  • Income and House_Worth are strongly positively correlated, which makes sense: higher-income households tend to own more valuable homes.

  • Crash_Flag is positively correlated with Record_Points and Teen_Driver, which may suggest that people with many tickets, or households with underage drivers (mostly 16-year-olds), tend to crash more often.

  • Single_Parent is negatively correlated with Claim_freq, suggesting single parents might be careful drivers.

Model 1: Multivariate Linear Regression with all variables

# Model 1: Multiple Linear Regression with all variables
model1 <- lm(Crash_Cost ~ . -Crash_Flag, data = train_f)  # Use all variables as predictors
summary(model1)
## 
## Call:
## lm(formula = Crash_Cost ~ . - Crash_Flag, data = train_f)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5439  -1646   -728    380 103905 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            9.000e+01  5.016e+02   0.179 0.857594    
## Teen_Driver            3.638e+02  1.230e+02   2.957 0.003118 ** 
## Age                    5.352e+00  7.640e+00   0.701 0.483628    
## Child_count            5.888e+01  7.124e+01   0.826 0.408565    
## Job_Tenure            -1.642e+01  1.572e+01  -1.045 0.296172    
## Income                -4.273e-03  1.923e-03  -2.222 0.026335 *  
## Single_Parent          6.539e+02  2.181e+02   2.998 0.002729 ** 
## House_Worth           -3.431e-04  6.288e-04  -0.546 0.585327    
## Marital_Status        -5.010e+02  1.562e+02  -3.207 0.001350 ** 
## Travel_Time            1.100e+01  3.469e+00   3.169 0.001536 ** 
## Bluebook               1.640e-02  9.268e-03   1.769 0.076934 .  
## Time_in_Force         -4.209e+01  1.315e+01  -3.201 0.001377 ** 
## Red_Car               -1.090e+01  1.607e+02  -0.068 0.945953    
## Prev_Claims           -1.559e-02  7.854e-03  -1.985 0.047192 *  
## Claim_freq             1.767e+02  5.932e+01   2.978 0.002908 ** 
## Revoked                6.438e+02  1.834e+02   3.511 0.000450 ***
## Record_Points          1.767e+02  2.805e+01   6.300 3.17e-10 ***
## Car_Age               -2.031e+01  1.320e+01  -1.539 0.123868    
## Education_High_School -1.275e+02  1.841e+02  -0.692 0.488705    
## Education_Bachelors   -2.925e+02  2.180e+02  -1.342 0.179749    
## Education_Masters      2.247e+01  3.177e+02   0.071 0.943634    
## Education_PhD          3.261e+01  3.646e+02   0.089 0.928745    
## Job_Clerical          -2.558e+01  2.055e+02  -0.124 0.900950    
## Job_Doctor            -5.948e+02  3.437e+02  -1.731 0.083550 .  
## Job_Home_Maker        -1.554e+02  2.862e+02  -0.543 0.587124    
## Job_Lawyer            -3.010e+02  3.155e+02  -0.954 0.340141    
## Job_Manager           -8.948e+02  2.551e+02  -3.508 0.000455 ***
## Job_Professional      -2.487e+01  2.301e+02  -0.108 0.913927    
## Job_Student           -1.944e+02  2.505e+02  -0.776 0.437772    
## Urbanicity_Urban       1.618e+03  1.500e+02  10.784  < 2e-16 ***
## Car_Type_Panel_Truck   2.199e+02  2.980e+02   0.738 0.460502    
## Car_Type_Pickup        2.949e+02  1.839e+02   1.604 0.108824    
## Car_Type_Sports_Car    9.439e+02  2.353e+02   4.012 6.08e-05 ***
## Car_Type_SUV           7.778e+02  1.931e+02   4.027 5.71e-05 ***
## Car_Type_Van           2.812e+02  2.294e+02   1.226 0.220423    
## Sex_Male               4.152e+02  1.980e+02   2.097 0.036021 *  
## Car_Purpose_Private   -8.125e+02  1.722e+02  -4.717 2.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4387 on 6491 degrees of freedom
## Multiple R-squared:  0.07359,    Adjusted R-squared:  0.06845 
## F-statistic: 14.32 on 36 and 6491 DF,  p-value: < 2.2e-16

The R-squared value of 0.07359 indicates that only about 7.4% of the variance in crash costs is explained by the model, which is quite low. While some variables, such as Record_Points, Urbanicity_Urban, and the car-type dummies, show significant relationships, the model is missing important structure that could better explain crash costs.
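
Base R's diagnostic plots for lm objects make the problem visible (a quick sketch): the heavy right tail of Crash_Cost dominates the residuals, motivating the log transformation used next.

par(mfrow = c(2, 2))
plot(model1)   # residuals vs fitted, Q-Q, scale-location, residuals vs leverage
par(mfrow = c(1, 1))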

Model 2: MVLR with a log-transformed response

# Log transformation of the Crash_Cost column
train_f$log_Crash_Cost <- log(train_f$Crash_Cost + 1)
# Running Multiple Linear Regression on the log-transformed response with all predictors
model2_refined <- lm(log_Crash_Cost ~ . -Crash_Flag -Crash_Cost, 
                     data = train_f)

summary(model2_refined)
## 
## Call:
## lm(formula = log_Crash_Cost ~ . - Crash_Flag - Crash_Cost, data = train_f)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6006 -2.3272 -0.8779  2.1137 10.3352 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.182e+00  3.691e-01   3.203 0.001368 ** 
## Teen_Driver            5.822e-01  9.055e-02   6.430 1.37e-10 ***
## Age                   -1.620e-03  5.623e-03  -0.288 0.773296    
## Child_count            5.930e-02  5.244e-02   1.131 0.258156    
## Job_Tenure            -2.935e-02  1.157e-02  -2.536 0.011227 *  
## Income                -4.436e-06  1.415e-06  -3.134 0.001731 ** 
## Single_Parent          6.474e-01  1.605e-01   4.033 5.57e-05 ***
## House_Worth           -9.006e-07  4.628e-07  -1.946 0.051687 .  
## Marital_Status        -6.251e-01  1.150e-01  -5.437 5.62e-08 ***
## Travel_Time            1.519e-02  2.554e-03   5.949 2.84e-09 ***
## Bluebook              -1.692e-05  6.821e-06  -2.481 0.013134 *  
## Time_in_Force         -6.520e-02  9.678e-03  -6.737 1.75e-11 ***
## Red_Car               -3.063e-02  1.183e-01  -0.259 0.795724    
## Prev_Claims           -2.125e-05  5.780e-06  -3.676 0.000239 ***
## Claim_freq             2.681e-01  4.366e-02   6.141 8.68e-10 ***
## Revoked                1.311e+00  1.350e-01   9.716  < 2e-16 ***
## Record_Points          1.853e-01  2.064e-02   8.979  < 2e-16 ***
## Car_Age                1.188e-03  9.715e-03   0.122 0.902691    
## Education_High_School  3.441e-02  1.355e-01   0.254 0.799538    
## Education_Bachelors   -5.328e-01  1.605e-01  -3.320 0.000904 ***
## Education_Masters     -1.784e-01  2.339e-01  -0.763 0.445671    
## Education_PhD         -2.713e-01  2.683e-01  -1.011 0.311991    
## Job_Clerical          -2.478e-03  1.512e-01  -0.016 0.986926    
## Job_Doctor            -8.287e-01  2.530e-01  -3.276 0.001059 ** 
## Job_Home_Maker        -2.056e-01  2.106e-01  -0.976 0.329095    
## Job_Lawyer            -5.853e-01  2.322e-01  -2.521 0.011738 *  
## Job_Manager           -1.183e+00  1.877e-01  -6.303 3.11e-10 ***
## Job_Professional      -2.707e-01  1.694e-01  -1.599 0.109959    
## Job_Student           -1.931e-01  1.843e-01  -1.048 0.294904    
## Urbanicity_Urban       2.415e+00  1.104e-01  21.872  < 2e-16 ***
## Car_Type_Panel_Truck   4.082e-01  2.193e-01   1.861 0.062748 .  
## Car_Type_Pickup        6.251e-01  1.354e-01   4.618 3.95e-06 ***
## Car_Type_Sports_Car    1.180e+00  1.732e-01   6.813 1.04e-11 ***
## Car_Type_SUV           9.671e-01  1.422e-01   6.803 1.12e-11 ***
## Car_Type_Van           5.661e-01  1.689e-01   3.353 0.000805 ***
## Sex_Male               2.046e-01  1.457e-01   1.404 0.160344    
## Car_Purpose_Private   -9.968e-01  1.268e-01  -7.863 4.37e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.229 on 6491 degrees of freedom
## Multiple R-squared:  0.2286, Adjusted R-squared:  0.2243 
## F-statistic: 53.44 on 36 and 6491 DF,  p-value: < 2.2e-16

Performing stepwise selection did not help either.
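
For reference, that search can be reproduced with base R's step(), an AIC-based procedure; this is a sketch starting from the full model2_refined fit, and model2_step is an illustrative name.

model2_step <- step(model2_refined, direction = "both", trace = 0)  # silent stepwise search
summary(model2_step)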

The updated model with a log-transformed dependent variable (log_Crash_Cost) shows an improved R-squared value of 0.2286, meaning that about 23% of the variation in crash costs is explained by the model. This is a significant improvement compared to the previous model. Some of the key variables, like Teen_Driver, Job_Tenure, Income, and Claim_freq, show meaningful relationships. For example, Teen_Driver has a positive coefficient, meaning that teen drivers tend to have higher crash costs, while Income has a negative coefficient, suggesting that higher income is associated with lower crash costs.

Since the dependent variable is log-transformed, each coefficient approximates the proportional change in crash cost for a one-unit change in the predictor; exactly, a one-unit increase multiplies the expected (cost + 1) by e^beta, i.e., a percentage change of (e^beta - 1) x 100%.
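
For example, converting the Teen_Driver coefficient into an exact percentage change (a sketch using the fitted model):

(exp(coef(model2_refined)["Teen_Driver"]) - 1) * 100  # roughly +79% in expected (cost + 1) per teen driver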

While the model fits the data better, there is still room for improvement.

Model 3: MVLR with Lasso

# Define Predictors (X) and Response (y) for your dataset
X2 <- model.matrix(log_Crash_Cost ~ Bluebook + Income + Single_Parent + House_Worth + Marital_Status + Sex_Male + Education_High_School + Education_Bachelors + Education_Masters + Travel_Time + Car_Purpose_Private + Teen_Driver + Age + Child_count + Job_Tenure + Car_Type_Panel_Truck + Car_Type_Pickup + Car_Type_Sports_Car + Car_Type_SUV + Car_Type_Van + Prev_Claims + Claim_freq + Revoked + Record_Points + Urbanicity_Urban + Job_Clerical + Job_Doctor + Job_Home_Maker + Job_Lawyer + Job_Manager + Job_Professional + Job_Student + Time_in_Force, 
                   data = train_f)[, -1]  # Remove intercept column

y2 <- train_f$log_Crash_Cost

# Scale the predictor matrix
X2 <- scale(X2)

# Lasso regression with cross-validation to find the optimal lambda
cv_lasso1 <- cv.glmnet(x = X2, 
                       y = y2, 
                       alpha = 1, 
                       family = "gaussian")  # Use "gaussian" for continuous target

# Optimal lambda
best_lambda_lasso1 <- cv_lasso1$lambda.min

# Fitting the Lasso model with the optimal lambda
lasso_model1 <- glmnet(X2, 
                       y2, 
                       alpha = 1, 
                       family = "gaussian", 
                       lambda = best_lambda_lasso1)

# Extract coefficients at the optimal lambda
lasso_coefs1 <- coef(lasso_model1, s = best_lambda_lasso1)

# Convert the coefficients into a df
lasso_coefs1_df <- as.data.frame(as.matrix(lasso_coefs1))
colnames(lasso_coefs1_df) <- "Coefficient"
print(lasso_coefs1_df)
##                        Coefficient
## (Intercept)            2.180892678
## Bluebook              -0.142639893
## Income                -0.231931214
## Single_Parent          0.217919720
## House_Worth           -0.114877469
## Marital_Status        -0.304079188
## Sex_Male               0.085193071
## Education_High_School  0.043792316
## Education_Bachelors   -0.193517820
## Education_Masters     -0.003567308
## Travel_Time            0.239319395
## Car_Purpose_Private   -0.482550156
## Teen_Driver            0.294346368
## Age                   -0.016309811
## Child_count            0.064437539
## Job_Tenure            -0.112701677
## Car_Type_Panel_Truck   0.103548911
## Car_Type_Pickup        0.224838050
## Car_Type_Sports_Car    0.355929652
## Car_Type_SUV           0.419783047
## Car_Type_Van           0.155829477
## Prev_Claims           -0.180884594
## Claim_freq             0.306377994
## Revoked                0.432073715
## Record_Points          0.396450923
## Urbanicity_Urban       0.970942094
## Job_Clerical           0.005960880
## Job_Doctor            -0.230122299
## Job_Home_Maker        -0.058256710
## Job_Lawyer            -0.222863295
## Job_Manager           -0.388368687
## Job_Professional      -0.094499605
## Job_Student           -0.046058792
## Time_in_Force         -0.267902190
W <- as.matrix(coef(lasso_model1))

# Identify predictors with non-zero coefficients, excluding the intercept
keep_X <- rownames(W)[W != 0 & rownames(W) != "(Intercept)"]

# Subset the original data to include only predictors with non-zero coefficients
X_O <- as.matrix(train_f[, keep_X, drop = FALSE])  # Original (unscaled) predictors selected by the lasso

# Combine target variable `y2` with selected predictors into a data frame
df <- cbind.data.frame(y2, X_O)

# Fit a linear model using only the selected predictors, without an intercept
la_eq <- lm(y2 ~ X_O - 1, data = df)

# Output the summary of the linear model
summary(la_eq)
## 
## Call:
## lm(formula = y2 ~ X_O - 1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6461 -2.3156 -0.8598  2.1120 10.1235 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## X_OBluebook              -1.122e-05  6.576e-06  -1.707 0.087929 .  
## X_OIncome                -4.190e-06  1.331e-06  -3.149 0.001646 ** 
## X_OSingle_Parent          7.358e-01  1.581e-01   4.655 3.31e-06 ***
## X_OHouse_Worth           -9.637e-07  4.618e-07  -2.087 0.036924 *  
## X_OMarital_Status        -5.847e-01  1.143e-01  -5.116 3.21e-07 ***
## X_OSex_Male               3.068e-01  1.225e-01   2.505 0.012281 *  
## X_OEducation_High_School  2.030e-01  1.168e-01   1.738 0.082205 .  
## X_OEducation_Bachelors   -3.444e-01  1.235e-01  -2.788 0.005324 ** 
## X_OEducation_Masters      5.844e-02  1.409e-01   0.415 0.678342    
## X_OTravel_Time            1.721e-02  2.469e-03   6.969 3.50e-12 ***
## X_OCar_Purpose_Private   -8.984e-01  1.230e-01  -7.306 3.07e-13 ***
## X_OTeen_Driver            5.512e-01  9.005e-02   6.121 9.86e-10 ***
## X_OAge                    8.499e-03  4.540e-03   1.872 0.061220 .  
## X_OChild_count            9.595e-02  5.119e-02   1.875 0.060887 .  
## X_OJob_Tenure            -2.277e-02  1.140e-02  -1.997 0.045860 *  
## X_OCar_Type_Panel_Truck   4.180e-01  2.188e-01   1.911 0.056107 .  
## X_OCar_Type_Pickup        7.311e-01  1.315e-01   5.559 2.82e-08 ***
## X_OCar_Type_Sports_Car    1.307e+00  1.685e-01   7.759 9.87e-15 ***
## X_OCar_Type_SUV           1.104e+00  1.357e-01   8.136 4.85e-16 ***
## X_OCar_Type_Van           6.017e-01  1.685e-01   3.571 0.000358 ***
## X_OPrev_Claims           -2.132e-05  5.783e-06  -3.686 0.000230 ***
## X_OClaim_freq             2.681e-01  4.368e-02   6.138 8.83e-10 ***
## X_ORevoked                1.322e+00  1.350e-01   9.794  < 2e-16 ***
## X_ORecord_Points          1.914e-01  2.056e-02   9.309  < 2e-16 ***
## X_OUrbanicity_Urban       2.492e+00  1.075e-01  23.185  < 2e-16 ***
## X_OJob_Clerical           5.514e-02  1.496e-01   0.369 0.712445    
## X_OJob_Doctor            -9.640e-01  2.176e-01  -4.431 9.53e-06 ***
## X_OJob_Home_Maker        -1.622e-01  1.987e-01  -0.816 0.414455    
## X_OJob_Lawyer            -7.232e-01  1.981e-01  -3.652 0.000263 ***
## X_OJob_Manager           -1.280e+00  1.742e-01  -7.352 2.19e-13 ***
## X_OJob_Professional      -3.348e-01  1.640e-01  -2.041 0.041273 *  
## X_OJob_Student           -7.087e-02  1.787e-01  -0.397 0.691665    
## X_OTime_in_Force         -6.100e-02  9.588e-03  -6.362 2.13e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.231 on 6495 degrees of freedom
## Multiple R-squared:  0.4293, Adjusted R-squared:  0.4264 
## F-statistic:   148 on 33 and 6495 DF,  p-value: < 2.2e-16

The model refit without an intercept (y2 ~ X_O - 1) reports a higher R-squared of 0.4293. Note, however, that R-squared in a no-intercept model is computed about zero rather than the sample mean, so it is inflated and not directly comparable to the intercept models above.
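
A figure comparable to the earlier models can be computed by hand (a sketch using the fitted residuals; rss and tss are illustrative names):

rss <- sum(residuals(la_eq)^2)    # residual sum of squares
tss <- sum((y2 - mean(y2))^2)     # total sum of squares about the mean
1 - rss / tss                     # R-squared measured about the mean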

Some key variables, such as Income, Single_Parent, Marital_Status, Revoked, and Urbanicity_Urban, have strong and statistically significant relationships with the outcome, as indicated by their low p-values.

For instance,

Single_Parent has a coefficient of 0.7358. This suggests that being a single parent increases the log-transformed value of the dependent variable (y2) by 0.7358 units, holding all other factors constant. Since the dependent variable is logged, this implies an approximately 108.7% increase in the expected value (e^0.7358 ≈ 2.09). This could reflect an economic scenario where single parents have less access to support systems or face more demanding situations, leading to higher financial vulnerability.

Urbanicity_Urban has a coefficient of 2.492, suggesting that living in an urban area increases log(crash costs) by 2.492 units, holding all other factors constant. Since the dependent variable is logged, this corresponds to crash costs being multiplied by roughly 12 (an increase of about 1,108%) for urban residents compared to those in rural areas. Urban areas generally have higher traffic density, more vehicles on the road, and an increased likelihood of accidents due to congestion, distracted driving, and complex road networks, so substantially higher crash costs there are plausible.
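
The conversions used above, checked in R:

(exp(0.7358) - 1) * 100  # Single_Parent: ~ +108.7%
(exp(2.492) - 1) * 100   # Urbanicity_Urban: ~ +1,108%, i.e. about a 12x multiplier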

Subject to the no-intercept caveat above, the lasso-selected model appears to fit somewhat better than the previous one.

Logit Model 1

# Convert the Crash_Flag to a factor with two levels (binary classification)
train_f$Crash_Flag <- as.factor(train_f$Crash_Flag)
# Model 1: Logistic Regression with all low-correlated variables
logit_model1 <- glm(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Education_Masters -Education_PhD -Sex_Male -Car_Type_SUV -Car_Type_Sports_Car -Job_Manager, 
                    data = train_f, family = binomial)

summary(logit_model1)
## 
## Call:
## glm(formula = Crash_Flag ~ . - Crash_Cost - log_Crash_Cost - 
##     Education_Masters - Education_PhD - Sex_Male - Car_Type_SUV - 
##     Car_Type_Sports_Car - Job_Manager, family = binomial, data = train_f)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.716e+00  2.931e-01  -5.853 4.82e-09 ***
## Teen_Driver            4.185e-01  6.842e-02   6.116 9.59e-10 ***
## Age                   -5.904e-04  4.432e-03  -0.133 0.894040    
## Child_count            7.521e-02  4.192e-02   1.794 0.072814 .  
## Job_Tenure            -1.970e-02  9.204e-03  -2.141 0.032306 *  
## Income                -5.530e-06  1.134e-06  -4.875 1.09e-06 ***
## Single_Parent          3.645e-01  1.216e-01   2.997 0.002723 ** 
## House_Worth           -9.401e-07  3.744e-07  -2.511 0.012027 *  
## Marital_Status        -5.022e-01  9.260e-02  -5.423 5.86e-08 ***
## Travel_Time            1.403e-02  2.076e-03   6.757 1.41e-11 ***
## Bluebook              -3.407e-05  5.284e-06  -6.448 1.14e-10 ***
## Time_in_Force         -5.579e-02  8.153e-03  -6.843 7.75e-12 ***
## Red_Car               -2.408e-01  7.741e-02  -3.111 0.001865 ** 
## Prev_Claims           -1.511e-05  4.224e-06  -3.578 0.000346 ***
## Claim_freq             2.087e-01  3.144e-02   6.640 3.15e-11 ***
## Revoked                9.465e-01  9.883e-02   9.577  < 2e-16 ***
## Record_Points          1.198e-01  1.513e-02   7.919 2.40e-15 ***
## Car_Age               -1.073e-02  7.094e-03  -1.513 0.130397    
## Education_High_School  8.325e-02  9.230e-02   0.902 0.367129    
## Education_Bachelors   -3.294e-01  9.676e-02  -3.404 0.000664 ***
## Job_Clerical           3.232e-01  1.092e-01   2.961 0.003067 ** 
## Job_Doctor            -1.821e-01  1.615e-01  -1.128 0.259450    
## Job_Home_Maker         1.747e-01  1.529e-01   1.143 0.253166    
## Job_Lawyer             3.741e-02  1.337e-01   0.280 0.779713    
## Job_Professional       1.462e-01  1.138e-01   1.284 0.199155    
## Job_Student            2.531e-03  1.399e-01   0.018 0.985572    
## Urbanicity_Urban       2.293e+00  1.257e-01  18.245  < 2e-16 ***
## Car_Type_Panel_Truck   1.796e-01  1.645e-01   1.092 0.274823    
## Car_Type_Pickup        3.943e-02  9.558e-02   0.413 0.679948    
## Car_Type_Van           2.051e-01  1.296e-01   1.582 0.113679    
## Car_Purpose_Private   -9.778e-01  9.113e-02 -10.729  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7533.1  on 6527  degrees of freedom
## Residual deviance: 5933.7  on 6497  degrees of freedom
## AIC: 5995.7
## 
## Number of Fisher Scoring iterations: 5
# Calculate VIF
vif_values <- vif(logit_model1)
print(vif_values)
##           Teen_Driver                   Age           Child_count 
##              1.351621              1.445736              2.195577 
##            Job_Tenure                Income         Single_Parent 
##              1.448771              2.300350              1.934538 
##           House_Worth        Marital_Status           Travel_Time 
##              1.915093              2.053758              1.035641 
##              Bluebook         Time_in_Force               Red_Car 
##              1.716980              1.011532              1.201895 
##           Prev_Claims            Claim_freq               Revoked 
##              1.633909              1.458158              1.305634 
##         Record_Points               Car_Age Education_High_School 
##              1.161424              1.570936              1.782192 
##   Education_Bachelors          Job_Clerical            Job_Doctor 
##              1.732772              1.589933              1.453315 
##        Job_Home_Maker            Job_Lawyer      Job_Professional 
##              1.644199              1.802142              1.374693 
##           Job_Student      Urbanicity_Urban  Car_Type_Panel_Truck 
##              1.653085              1.137673              2.012575 
##       Car_Type_Pickup          Car_Type_Van   Car_Purpose_Private 
##              1.334408              1.386032              1.960249
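
None of the VIF values exceed the common rule-of-thumb cutoff of 5, so multicollinearity does not appear to be a concern among the retained predictors. A quick programmatic check (assuming vif_values from above):

# Flag any predictor whose VIF exceeds the usual cutoff of 5
vif_values[vif_values > 5]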

Key Variables:

  1. Teen_Driver

    • Coefficient: 0.4185

    • Interpretation: Each additional teen driver on the policy raises the log-odds of a crash by 0.4185 units. This corresponds to roughly 52% higher odds of a crash (exp(0.4185) is about 1.52).

    • Economic Interpretation: Teen drivers are often considered higher risk due to inexperience, which justifies the higher odds of accidents.

  2. Income

    • Coefficient: -5.530e-06

    • Interpretation: For each additional dollar of income, the log-odds of having a crash decrease by 0.00000553. The per-dollar effect is tiny, but it compounds: per $10,000 of income, the odds of a crash fall by about 5.4% (exp(-0.0553) is about 0.946).

    • Economic Interpretation: Higher-income individuals may have better vehicles, safer driving habits, or more resources to prevent accidents, leading to a lower likelihood of crashes.

  3. Single_Parent

    • Coefficient: 0.3645

    • Interpretation: Being a single parent increases the log-odds of a crash by 0.3645 units, which corresponds to an approximately 43.9% increase in the odds of having a crash.

    • Economic Interpretation: Single parents may have additional driving stressors, such as transporting children, which could lead to a higher risk of accidents.

  4. Travel_Time

    • Coefficient: 0.01403

    • Interpretation: For each additional unit increase in travel time, the log-odds of having a crash increase by 0.01403. This corresponds to a 1.4% increase in odds for each additional time unit of travel.

    • Economic Interpretation: Longer travel times often correlate with increased exposure to driving risk, such as more time spent on the road and more opportunities for accidents.

  5. Urbanicity_Urban

    • Coefficient: 2.293

    • Interpretation: Living in an urban area increases the log-odds of a crash by 2.293 units. This corresponds to odds approximately 9.9 times higher than in rural areas (exp(2.293) is about 9.9).

    • Economic Interpretation: Urban areas generally have higher traffic density, congestion, and more driving risks, justifying the much higher odds of accidents in urban settings.

  6. Car_Purpose_Private

    • Coefficient: -0.9778

    • Interpretation: If the car is used for private purposes, the log-odds of having a crash decrease by 0.9778 units. This corresponds to about a 62.5% decrease in the odds of a crash compared to cars used for business purposes.

    • Economic Interpretation: Private use vehicles may be driven less frequently or under safer conditions compared to vehicles used for business purposes, reducing the crash odds.
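
All of the odds interpretations above come from exponentiating the log-odds coefficients. A minimal sketch to reproduce them, with Wald 95% confidence intervals (assuming logit_model1 is still in memory):

# Odds ratios and 95% Wald confidence intervals for every coefficient
round(exp(cbind(OR = coef(logit_model1), confint.default(logit_model1))), 3)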

Logit Model 2

train_f_lasso <- train_f[, !names(train_f) %in% c("Crash_Cost", "log_Crash_Cost")]  # Exclude response and unwanted variables
X <- model.matrix(Crash_Flag ~ ., train_f_lasso)[, -1]  # Convert to matrix and remove intercept column
y <- train_f$Crash_Flag  # Response variable

# Set a range of lambda values for cross-validation
lambda_grid <- 10^seq(3, -3, length = 100)

# Fit the Lasso model using glmnet
lasso_model <- cv.glmnet(X, y, alpha = 1, lambda = lambda_grid, family = "binomial")

# View the best lambda value
lasso_model$lambda.min  # Best lambda based on cross-validation
## [1] 0.001
# Plot the cross-validation curve
plot(lasso_model)

# Get the coefficients for the best lambda
coef(lasso_model, s = "lambda.min")
## 37 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)           -2.348478e+00
## Teen_Driver            4.303671e-01
## Age                   -1.618479e-03
## Child_count            4.656802e-02
## Job_Tenure            -1.703011e-02
## Income                -4.143862e-06
## Single_Parent          3.884096e-01
## House_Worth           -8.750704e-07
## Marital_Status        -4.946172e-01
## Travel_Time            1.280118e-02
## Bluebook              -1.915300e-05
## Time_in_Force         -5.429347e-02
## Red_Car                .           
## Prev_Claims           -1.313599e-05
## Claim_freq             1.932873e-01
## Revoked                8.979760e-01
## Record_Points          1.135534e-01
## Car_Age                .           
## Education_High_School  .           
## Education_Bachelors   -4.008179e-01
## Education_Masters     -1.974289e-01
## Education_PhD         -2.578841e-01
## Job_Clerical           8.082546e-02
## Job_Doctor            -3.877766e-01
## Job_Home_Maker        -6.092110e-02
## Job_Lawyer            -2.087108e-01
## Job_Manager           -7.663598e-01
## Job_Professional      -1.062664e-01
## Job_Student           -8.845932e-02
## Urbanicity_Urban       2.294023e+00
## Car_Type_Panel_Truck   3.859180e-01
## Car_Type_Pickup        4.740732e-01
## Car_Type_Sports_Car    8.930570e-01
## Car_Type_SUV           7.182656e-01
## Car_Type_Van           4.841622e-01
## Sex_Male               8.549020e-02
## Car_Purpose_Private   -8.087533e-01
# Predict on the training data using the best lambda
lasso_pred <- predict(lasso_model, X, s = "lambda.min", type = "response")

Here are the key factors with significant effects on crash likelihood:

  1. Teen_Driver (0.43): Each additional teen driver significantly raises the likelihood of a crash, increasing the odds by about 54% (exp(0.43) is about 1.54).

  2. Single_Parent (0.39): Single parents are more likely to be involved in crashes, with a significant positive effect.

  3. Income (-4.14e-06): Higher income slightly decreases the likelihood of a crash.

  4. Marital_Status (-0.49): Married individuals have a lower likelihood of being involved in a crash.

  5. Travel_Time (0.013): Longer travel times significantly increase the likelihood of a crash.

  6. Prev_Claims (-0.000013): Each additional dollar of prior claim payouts is associated with a very slight reduction in the likelihood of a future crash.

  7. Revoked (0.90): Having a revoked license significantly increases the likelihood of a crash.

  8. Urbanicity_Urban (2.29): Living in urban areas drastically increases the likelihood of a crash.

# Evaluate model performance (e.g., AUC, accuracy, etc.)
library(pROC)

roc_curve <- roc(y, lasso_pred)
plot(roc_curve, main = "ROC Curve for Lasso Model")

auc(roc_curve)  # AUC value
## Area under the curve: 0.8146
# You can also calculate accuracy, confusion matrix, etc.
lasso_class <- ifelse(lasso_pred > 0.5, 1, 0)
table(Predicted = lasso_class, Actual = y)  # Confusion matrix
##          Actual
## Predicted    0    1
##         0 4457  998
##         1  349  724
# Calculate the confusion matrix from the predicted values
conf_matrix <- table(Predicted = lasso_class, Actual = y)

# Accuracy: Proportion of correct predictions
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy: ", accuracy, "\n")
## Accuracy:  0.7936581
# Precision: Proportion of positive predictions that are correct
precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
cat("Precision: ", precision, "\n")
## Precision:  0.6747437
# Recall: Proportion of actual positives that are correctly identified
recall <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
cat("Recall: ", recall, "\n")
## Recall:  0.4204413
# F1-Score: Harmonic mean of precision and recall
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1-Score: ", f1_score, "\n")
## F1-Score:  0.518068
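
The same metrics can be obtained in a single call with caret's confusionMatrix; a sketch, assuming lasso_class and y from above, with "1" treated as the positive class:

# One-call summary: accuracy, sensitivity, specificity, PPV, etc.
caret::confusionMatrix(factor(lasso_class, levels = c(0, 1)),
                       factor(y, levels = c(0, 1)),
                       positive = "1")
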
  1. AUC (0.8146): This suggests that the model is performing reasonably well, as values above 0.8 typically indicate good discrimination between the two classes. However, there may still be room for improvement.

  2. Confusion Matrix:

    • The model misses 998 crashes (false negatives: predicted 0 when the actual value is 1), which is high relative to the 724 true positives. The model is conservative about predicting 1s, as is typical with an imbalanced outcome.

    • False positives (349: predicted 1 when the actual value is 0) are comparatively few; improving recall (correctly identifying more of the 1s) would likely trade off some precision.

  3. Sparse Coefficients:

    • The "." entries in the coefficient list mark features whose coefficients the Lasso shrank exactly to zero, effectively dropping them from the model. For example, Red_Car, Car_Age, and Education_High_School were not deemed important.
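
The surviving predictors can also be listed programmatically by filtering the coefficient vector for nonzero entries; a minimal sketch, assuming lasso_model from above:

# Extract the coefficients at lambda.min and keep only the nonzero ones
coefs <- as.matrix(coef(lasso_model, s = "lambda.min"))
coefs[coefs[, 1] != 0, , drop = FALSE]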

Logit Model 3 with log-transformed predictors

# Log Transformations of potentially skewed variables
train_f$log_Income <- log(train_f$Income + 1)  
train_f$log_Age <- log(train_f$Age + 1)  
train_f$log_House_Worth <- log(train_f$House_Worth + 1)  
# Fit the Logistic Regression model with log-transformed variables
logit_model3 <- glm(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Income -Age -House_Worth + log_Income + log_Age + log_House_Worth, 
                    data = train_f, 
                    family = binomial)

# Summarize the model
summary(logit_model3)
## 
## Call:
## glm(formula = Crash_Flag ~ . - Crash_Cost - log_Crash_Cost - 
##     Income - Age - House_Worth + log_Income + log_Age + log_House_Worth, 
##     family = binomial, data = train_f)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -6.324e-01  7.838e-01  -0.807 0.419754    
## Teen_Driver            4.662e-01  6.993e-02   6.667 2.61e-11 ***
## Child_count            1.141e-02  4.283e-02   0.266 0.789910    
## Job_Tenure             7.090e-03  1.158e-02   0.612 0.540495    
## Single_Parent          3.780e-01  1.233e-01   3.066 0.002167 ** 
## Marital_Status        -4.633e-01  9.621e-02  -4.816 1.47e-06 ***
## Travel_Time            1.332e-02  2.110e-03   6.312 2.75e-10 ***
## Bluebook              -2.092e-05  5.826e-06  -3.591 0.000330 ***
## Time_in_Force         -5.600e-02  8.233e-03  -6.803 1.03e-11 ***
## Red_Car               -2.122e-02  9.649e-02  -0.220 0.825963    
## Prev_Claims           -1.538e-05  4.294e-06  -3.583 0.000340 ***
## Claim_freq             2.058e-01  3.181e-02   6.471 9.71e-11 ***
## Revoked                9.349e-01  1.003e-01   9.322  < 2e-16 ***
## Record_Points          1.141e-01  1.532e-02   7.448 9.47e-14 ***
## Car_Age                1.528e-03  8.058e-03   0.190 0.849599    
## Education_High_School -2.106e-02  1.061e-01  -0.199 0.842653    
## Education_Bachelors   -4.902e-01  1.271e-01  -3.858 0.000114 ***
## Education_Masters     -3.086e-01  1.946e-01  -1.586 0.112837    
## Education_PhD         -5.371e-01  2.219e-01  -2.421 0.015476 *  
## Job_Clerical           9.212e-02  1.173e-01   0.785 0.432169    
## Job_Doctor            -5.457e-01  2.109e-01  -2.588 0.009663 ** 
## Job_Home_Maker        -2.718e-01  1.828e-01  -1.487 0.137037    
## Job_Lawyer            -3.307e-01  1.949e-01  -1.697 0.089678 .  
## Job_Manager           -8.898e-01  1.582e-01  -5.624 1.87e-08 ***
## Job_Professional      -2.060e-01  1.347e-01  -1.529 0.126162    
## Job_Student           -4.262e-01  1.627e-01  -2.620 0.008792 ** 
## Urbanicity_Urban       2.384e+00  1.261e-01  18.895  < 2e-16 ***
## Car_Type_Panel_Truck   4.656e-01  1.801e-01   2.585 0.009725 ** 
## Car_Type_Pickup        5.811e-01  1.120e-01   5.186 2.14e-07 ***
## Car_Type_Sports_Car    1.030e+00  1.458e-01   7.063 1.62e-12 ***
## Car_Type_SUV           8.412e-01  1.244e-01   6.764 1.34e-11 ***
## Car_Type_Van           5.541e-01  1.409e-01   3.931 8.44e-05 ***
## Sex_Male               1.610e-01  1.249e-01   1.289 0.197426    
## Car_Purpose_Private   -7.762e-01  9.985e-02  -7.774 7.62e-15 ***
## log_Income            -8.485e-02  1.906e-02  -4.452 8.50e-06 ***
## log_Age               -3.703e-01  1.962e-01  -1.887 0.059114 .  
## log_House_Worth       -2.563e-02  7.521e-03  -3.408 0.000654 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7533.1  on 6527  degrees of freedom
## Residual deviance: 5813.3  on 6491  degrees of freedom
## AIC: 5887.3
## 
## Number of Fisher Scoring iterations: 5
  • log_Income: The coefficient of -0.0849 corresponds to an odds ratio of exp(-0.0849), about 0.919, per one-unit increase in log income; equivalently, multiplying income by e (about 2.72 times) lowers the crash odds by roughly 8.1%, and a 1% increase in income lowers them by only about 0.085%. Either way, higher income is associated with a lower likelihood of a crash, perhaps because higher-income individuals are less likely to engage in risky driving or have better access to safer vehicles and driving conditions.

  • Teen_Driver and Single_Parent both significantly increase the likelihood of a crash, with odds approximately 1.6 times and 1.46 times higher, respectively.

  • Marital_Status has a protective effect, with married individuals having about 37% lower odds of a crash.

  • Travel_Time and Claim_freq are both positively related to crashes, with small but statistically significant increases in the odds: roughly 1.3% per additional minute of travel time and roughly 23% per additional prior claim.

  • log_Age: The coefficient of -0.370 implies that a 1% increase in age lowers the odds of a crash by only about 0.37%; doubling age lowers them by roughly 23%. The effect is only marginally significant (p = 0.059).

  • log_House_Worth: A 1% increase in house worth lowers the odds of a crash by only about 0.026%; multiplying house worth by e (about 2.72 times) lowers them by roughly 2.5% (see the sketch below).
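
The per-1% effects quoted above follow from the fact that a 1% increase in a log-transformed predictor multiplies the odds by 1.01 raised to the coefficient (approximately, since the transform is log(x + 1) rather than log(x)). A minimal sketch, assuming logit_model3 from above:

# Percent change in crash odds for a 1% increase in each log-transformed predictor
b <- coef(logit_model3)[c("log_Income", "log_Age", "log_House_Worth")]
round(100 * (1.01^b - 1), 3)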

vif(logit_model3)
##           Teen_Driver           Child_count            Job_Tenure 
##              1.376513              2.234487              2.253563 
##         Single_Parent        Marital_Status           Travel_Time 
##              1.919129              2.161893              1.036463 
##              Bluebook         Time_in_Force               Red_Car 
##              2.117413              1.010912              1.826999 
##           Prev_Claims            Claim_freq               Revoked 
##              1.625181              1.453600              1.304097 
##         Record_Points               Car_Age Education_High_School 
##              1.162441              1.997002              2.295532 
##   Education_Bachelors     Education_Masters         Education_PhD 
##              2.900143              5.507244              3.187359 
##          Job_Clerical            Job_Doctor        Job_Home_Maker 
##              1.807290              2.524364              2.262587 
##            Job_Lawyer           Job_Manager      Job_Professional 
##              3.829665              1.839548              1.915930 
##           Job_Student      Urbanicity_Urban  Car_Type_Panel_Truck 
##              2.185151              1.145092              2.414511 
##       Car_Type_Pickup   Car_Type_Sports_Car          Car_Type_SUV 
##              1.822483              2.142494              3.069440 
##          Car_Type_Van              Sex_Male   Car_Purpose_Private 
##              1.639218              3.662596              2.304570 
##            log_Income               log_Age       log_House_Worth 
##              3.572014              1.508095              1.780199
# Perform 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation
# Train the model with cross-validation
cv_model3 <- train(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Income -Age -House_Worth + log_Income + log_Age + log_House_Worth, 
                   data = train_f, 
                   method = "glm", 
                   family = "binomial", 
                   trControl = train_control)

# Summary of the cross-validation results
print(cv_model3)
## Generalized Linear Model 
## 
## 6528 samples
##   41 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 5875, 5875, 5875, 5875, 5876, 5875, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7882978  0.3824096
# Make predictions on the training data (after cross-validation)
lasso_pred3 <- predict(cv_model3, newdata = train_f, type = "prob")

# Extract probabilities for the positive class (class 1)
lasso_pred3_class <- lasso_pred3[, 2]

# Predict the class (1 if probability > 0.5, else 0)
lasso_class3 <- ifelse(lasso_pred3_class > 0.5, 1, 0)

# Confusion Matrix for model 3
conf_matrix3 <- table(Predicted = lasso_class3, Actual = train_f$Crash_Flag)
# Calculate accuracy, precision, recall, and F1-score for model 3
accuracy3 <- sum(diag(conf_matrix3)) / sum(conf_matrix3)
precision3 <- conf_matrix3[2, 2] / sum(conf_matrix3[2, ])
recall3 <- conf_matrix3[2, 2] / sum(conf_matrix3[, 2])
f1_score3 <- 2 * (precision3 * recall3) / (precision3 + recall3)

cat("Model 3 Accuracy: ", accuracy3, "\n")
## Model 3 Accuracy:  0.7930453
cat("Model 3 Precision: ", precision3, "\n")
## Model 3 Precision:  0.6697164
cat("Model 3 Recall: ", recall3, "\n")
## Model 3 Recall:  0.4250871
cat("Model 3 F1-Score: ", f1_score3, "\n")
## Model 3 F1-Score:  0.520071
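
For an at-a-glance comparison with the Lasso model, the metrics can be collected into a small table (a sketch, assuming the objects computed in the earlier sections are still in memory):

# Side-by-side training-set metrics for Model 2 (Lasso) and Model 3
data.frame(Model     = c("Lasso (Model 2)", "Log-transformed GLM (Model 3)"),
           Accuracy  = c(accuracy,  accuracy3),
           Precision = c(precision, precision3),
           Recall    = c(recall,    recall3),
           F1        = c(f1_score,  f1_score3))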

This model gives accuracy comparable to the Lasso model (79.3% vs 79.4%) with slightly better recall (0.425 vs 0.420) and F1-score (0.520 vs 0.518), so we will proceed with it.

Logit Model 4 with Ridge

# Exclude the response and unwanted variables
train_f_ridge <- train_f[, !names(train_f) %in% c("Crash_Cost", "log_Crash_Cost")]  # Exclude response and unwanted variables
X <- model.matrix(Crash_Flag ~ ., train_f_ridge)[, -1]  # Convert to matrix and remove intercept column
y <- train_f$Crash_Flag  # Response variable

# Set a range of lambda values for cross-validation
lambda_grid <- 10^seq(3, -3, length = 100)

# Fit the Ridge model using glmnet (alpha = 0 for Ridge)
ridge_model <- cv.glmnet(X, y, alpha = 0, lambda = lambda_grid, family = "binomial")

# View the best lambda value
ridge_model$lambda.min  # Best lambda based on cross-validation
## [1] 0.001

# Plot the cross-validation curve
plot(ridge_model)


# Get the coefficients for the best lambda
coef(ridge_model, s = "lambda.min")
## 40 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)            9.398596e+00
## Teen_Driver            4.930501e-01
## Age                    8.662841e-02
## Child_count            2.305280e-02
## Job_Tenure            -2.077821e-03
## Income                -4.079642e-06
## Single_Parent          3.365570e-01
## House_Worth            4.461192e-07
## Marital_Status        -4.834584e-01
## Travel_Time            1.307039e-02
## Bluebook              -2.052077e-05
## Time_in_Force         -5.595448e-02
## Red_Car               -4.182434e-02
## Prev_Claims           -1.464760e-05
## Claim_freq             2.010904e-01
## Revoked                9.227511e-01
## Record_Points          1.098507e-01
## Car_Age                2.392477e-03
## Education_High_School -6.204068e-03
## Education_Bachelors   -4.347815e-01
## Education_Masters     -2.351675e-01
## Education_PhD         -3.622559e-01
## Job_Clerical           4.839124e-02
## Job_Doctor            -5.106872e-01
## Job_Home_Maker        -3.079144e-01
## Job_Lawyer            -3.007698e-01
## Job_Manager           -8.590562e-01
## Job_Professional      -1.870325e-01
## Job_Student           -4.741720e-01
## Urbanicity_Urban       2.339567e+00
## Car_Type_Panel_Truck   5.150674e-01
## Car_Type_Pickup        5.668864e-01
## Car_Type_Sports_Car    9.293479e-01
## Car_Type_SUV           7.678674e-01
## Car_Type_Van           5.812758e-01
## Sex_Male               1.037209e-01
## Car_Purpose_Private   -7.818775e-01
## log_Income            -5.341475e-02
## log_Age               -4.002900e+00
## log_House_Worth       -3.255051e-02

# Predict on the training data using the best lambda
ridge_pred <- predict(ridge_model, X, s = "lambda.min", type = "response")

# Evaluate model performance (e.g., AUC, accuracy, etc.)
library(pROC)

roc_curve_ridge <- roc(y, ridge_pred)
plot(roc_curve_ridge, main = "ROC Curve for Ridge Model")

auc(roc_curve_ridge)  # AUC value
## Area under the curve: 0.818

# You can also calculate accuracy, confusion matrix, etc.
ridge_class <- ifelse(ridge_pred > 0.5, 1, 0)
table(Predicted = ridge_class, Actual = y)  # Confusion matrix
##          Actual
## Predicted    0    1
##         0 4447  988
##         1  359  734

# Calculate the confusion matrix from the predicted values
conf_matrix_ridge <- table(Predicted = ridge_class, Actual = y)

# Accuracy: Proportion of correct predictions
accuracy_ridge <- sum(diag(conf_matrix_ridge)) / sum(conf_matrix_ridge)
cat("Accuracy (Ridge): ", accuracy_ridge, "\n")
## Accuracy (Ridge):  0.7936581

# Precision: Proportion of positive predictions that are correct
precision_ridge <- conf_matrix_ridge[2, 2] / sum(conf_matrix_ridge[2, ])
cat("Precision (Ridge): ", precision_ridge, "\n")
## Precision (Ridge):  0.6715462

# Recall: Proportion of actual positives that are correctly identified
recall_ridge <- conf_matrix_ridge[2, 2] / sum(conf_matrix_ridge[, 2])
cat("Recall (Ridge): ", recall_ridge, "\n")
## Recall (Ridge):  0.4262485

# F1-Score: Harmonic mean of precision and recall
f1_score_ridge <- 2 * (precision_ridge * recall_ridge) / (precision_ridge + recall_ridge)
cat("F1-Score (Ridge): ", f1_score_ridge, "\n")
## F1-Score (Ridge):  0.521492
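
The Ridge model's AUC (0.818) is marginally better than the Lasso's (0.815). The two ROC curves can be overlaid for a direct visual comparison; a sketch, assuming roc_curve and roc_curve_ridge from above:

# Overlay the Lasso and Ridge ROC curves on one plot
plot(roc_curve, col = "black", main = "ROC Curves: Lasso vs Ridge")
plot(roc_curve_ridge, add = TRUE, col = "blue")
legend("bottomright",
       legend = c("Lasso (AUC = 0.815)", "Ridge (AUC = 0.818)"),
       col = c("black", "blue"), lwd = 2)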

Selection

For MVLR - Model 2

# Predicting on the training data using la_eq model
pred_train <- predict(la_eq, newdata = train_f)

# Calculate Mean Squared Error (MSE)
mse <- mean((train_f$log_Crash_Cost - pred_train)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 10.38533
# Calculate R-squared
rss <- sum((train_f$log_Crash_Cost - pred_train)^2)
tss <- sum((train_f$log_Crash_Cost - mean(train_f$log_Crash_Cost))^2)
r_squared <- 1 - rss/tss
cat("R-squared:", r_squared, "\n")
## R-squared: 0.2272999
# Calculate Adjusted R-squared
n <- nrow(train_f)  # number of observations
p <- ncol(train_f)  # caution: this counts every column of train_f, overstating the
                    # number of predictors actually used in la_eq; length(coef(la_eq))
                    # would be a tighter choice
adjusted_r_squared <- 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
cat("Adjusted R-squared:", adjusted_r_squared, "\n")
## Adjusted R-squared: 0.2222955
# Calculate F-statistic
f_statistic <- (r_squared / (p - 1)) / ((1 - r_squared) / (n - p))
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 46.53517

Residual Analysis

# Residuals vs Fitted Values plot
plot(pred_train, train_f$log_Crash_Cost - pred_train, 
     main = "Residuals vs Fitted Values", 
     xlab = "Fitted Values", 
     ylab = "Residuals")
abline(h = 0, col = "red")

# Normal Q-Q Plot to check if residuals are normally distributed
qqnorm(train_f$log_Crash_Cost - pred_train)
qqline(train_f$log_Crash_Cost - pred_train, col = "red")

# Scale-Location plot
plot(pred_train, sqrt(abs(train_f$log_Crash_Cost - pred_train)), 
     main = "Scale-Location Plot", 
     xlab = "Fitted Values", 
     ylab = "Sqrt of |Residuals|")
abline(h = 0, col = "red")

# Residuals vs Leverage plot
plot(hatvalues(la_eq), train_f$log_Crash_Cost - pred_train, 
     main = "Residuals vs Leverage", 
     xlab = "Leverage", 
     ylab = "Residuals")
abline(h = 0, col = "red")

Discrete-like distribution: the Crash_Cost values are clustered around specific points rather than following a smooth continuous distribution, so the model struggles to capture variation in this variable. In particular, because most policies record no crash, the bulk of Crash_Cost values equal 0; this imbalance leaves little meaningful variation for the regression to explain.
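
The degree of zero-inflation can be quantified directly; a quick check, assuming train_f from above:

# Share of policies with zero crash cost, and the cost distribution among actual crashes
mean(train_f$Crash_Cost == 0)
summary(train_f$Crash_Cost[train_f$Crash_Cost > 0])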

Test dataset

Preparing test dataset

df_test$SEX        <- dplyr::recode(df_test$SEX,
                                  "M"   = "Male",
                                  "z_F"  = "Female"
                                  )

df_test$CAR_TYPE   <- dplyr::recode(df_test$CAR_TYPE,
                                  "z_SUV"   = "SUV"
                                 )

df_test$EDUCATION  <- dplyr::recode(df_test$EDUCATION,
                                  "z_High School"   = "High School",
                                  "<High School"   = "Before HS"
                                 ) 

df_test$MSTATUS    <- dplyr::recode(df_test$MSTATUS,
                                  "z_No" = "No"
                                  )

df_test$RED_CAR    <- dplyr::recode(df_test$RED_CAR,
                                  "yes"  = "Yes",
                                  "no"   = "No"
                                  )

df_test$URBANICITY <- dplyr::recode(df_test$URBANICITY,
                                  "Highly Urban/ Urban"   = "Urban",
                                  "z_Highly Rural/ Rural" = "Rural"
                                  )

df_test$JOB        <- dplyr::recode(df_test$JOB, 
                               "z_Blue Collar" = "Blue Collar")
# Renaming the col names for easy read
df_test <- df_test |>
  dplyr::rename(Crash_Flag = TARGET_FLAG,
                Crash_Cost = TARGET_AMT,
                Teen_Driver = KIDSDRIV,
                Child_count = HOMEKIDS,
                Income = INCOME,
                Job_Tenure = YOJ,
                House_Worth = HOME_VAL,
                Marital_Status = MSTATUS,
                Sex = SEX,
                Job = JOB,
                Age = AGE,
                Car_Purpose = CAR_USE,
                Bluebook = BLUEBOOK,
                Time_in_Force = TIF,
                Car_Type = CAR_TYPE,
                Red_Car= RED_CAR,
                Prev_Claims = OLDCLAIM,
                Education = EDUCATION,
                Single_Parent = PARENT1, 
                Revoked = REVOKED,
                Record_Points = MVR_PTS,
                Car_Age = CAR_AGE,
                Urbanicity    = URBANICITY,
                Claim_freq    = CLM_FREQ,
                Travel_Time = TRAVTIME
         )
df_test$Single_Parent <- as.factor(df_test$Single_Parent)
df_test$Marital_Status <- as.factor(df_test$Marital_Status)
df_test$Sex <- as.factor(df_test$Sex)
df_test$Education <- as.factor(df_test$Education)
df_test$Job <- as.factor(df_test$Job)
df_test$Car_Purpose <- as.factor(df_test$Car_Purpose)
df_test$Bluebook <- as.factor(df_test$Bluebook)
df_test$Prev_Claims <- as.factor(df_test$Prev_Claims)
df_test$Revoked <- as.factor(df_test$Revoked)
df_test$Urbanicity <- as.factor(df_test$Urbanicity)
df_test$Car_Type <- as.factor(df_test$Car_Type)
df_test$Red_Car <- as.factor(df_test$Red_Car)

# Remove non-numeric characters and convert to integer
df_test$Bluebook <- as.integer(gsub("[^0-9]", "", df_test$Bluebook))
df_test$Prev_Claims <- as.integer(gsub("[^0-9]", "", df_test$Prev_Claims))
vis_dat(df_test) +
  theme(
    axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

# Converting Red_Car, Revoked, Marital_Status and Single_Parent to binary (1 = Yes, 0 = No)
df_test$Red_Car <- ifelse(df_test$Red_Car == "Yes", 1, 0)
df_test$Revoked <- ifelse(df_test$Revoked == "Yes", 1, 0)
df_test$Marital_Status <- ifelse(df_test$Marital_Status == "Yes", 1, 0)
df_test$Single_Parent <- ifelse(df_test$Single_Parent == "Yes", 1, 0)
# Check the levels of Education in the test data
levels(df_test$Education)
## [1] "Bachelors"   "Before HS"   "High School" "Masters"     "PhD"
df_test$Education <- factor(df_test$Education, levels = c("Before HS", "High School", "Bachelors", "Masters", "PhD"))
test_f <- dummy_cols(df_test,
                     select_columns = c("Education", "Job", "Urbanicity", "Car_Type", "Sex", "Car_Purpose"),
                     remove_first_dummy = TRUE,
                     remove_selected_columns = TRUE)
colnames(test_f) <- gsub(" ", "_", colnames(test_f))
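
Before scoring, it is worth confirming that the encoded test columns line up with the training frame; a quick check (assuming train_f is still in memory; the log-transformed columns added during modelling will appear as expected differences):

# Columns that differ between the training and test frames
setdiff(colnames(train_f), colnames(test_f))
setdiff(colnames(test_f), colnames(train_f))
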
stargazer(test_f, type = "text", title = "Summary of df_test", summary = TRUE)
## 
## Summary of df
## ====================================================================
## Statistic               N      Mean      St. Dev.    Min     Max    
## --------------------------------------------------------------------
## Crash_Flag            1,633    0.264       0.441      0       1     
## Crash_Cost            1,633  1,655.067   5,288.826  0.000 73,783.470
## Teen_Driver           1,633    0.172       0.523      0       4     
## Age                   1,633   44.524       8.535     16       72    
## Child_count           1,633    0.731       1.156      0       5     
## Job_Tenure            1,633   10.426       4.056      0       19    
## Income                1,633 62,536.720  47,106.420    0    290,846  
## Single_Parent         1,633    0.135       0.342      0       1     
## House_Worth           1,633 158,398.200 130,443.400   0    682,634  
## Marital_Status        1,633    0.596       0.491      0       1     
## Travel_Time           1,633   33.659      15.679      5      113    
## Bluebook              1,633 15,982.280   8,568.104  1,500   50,970  
## Time_in_Force         1,633    5.289       4.134      1       22    
## Red_Car               1,633    0.288       0.453      0       1     
## Prev_Claims           1,633  3,708.301   8,155.113    0     53,477  
## Claim_freq            1,633    0.798       1.158      0       5     
## Revoked               1,633    0.104       0.305      0       1     
## Record_Points         1,633    1.677       2.156      0       13    
## Car_Age               1,633    8.409       5.645      0       28    
## Education_High_School 1,633    0.288       0.453      0       1     
## Education_Bachelors   1,633    0.282       0.450      0       1     
## Education_Masters     1,633    0.200       0.400      0       1     
## Education_PhD         1,633    0.089       0.285      0       1     
## Job_Clerical          1,633    0.133       0.340      0       1     
## Job_Doctor            1,633    0.077       0.266      0       1     
## Job_Home_Maker        1,633    0.079       0.270      0       1     
## Job_Lawyer            1,633    0.113       0.317      0       1     
## Job_Manager           1,633    0.141       0.348      0       1     
## Job_Professional      1,633    0.141       0.348      0       1     
## Job_Student           1,633    0.088       0.284      0       1     
## Urbanicity_Urban      1,633    0.802       0.398      0       1     
## Car_Type_Panel_Truck  1,633    0.092       0.289      0       1     
## Car_Type_Pickup       1,633    0.173       0.379      0       1     
## Car_Type_Sports_Car   1,633    0.118       0.322      0       1     
## Car_Type_SUV          1,633    0.283       0.451      0       1     
## Car_Type_Van          1,633    0.093       0.291      0       1     
## Sex_Male              1,633    0.460       0.499      0       1     
## Car_Purpose_Private   1,633    0.620       0.486      0       1     
## --------------------------------------------------------------------

Crash cost prediction on test set

# Log transformation of the Crash_Cost column
test_f$log_Crash_Cost <- log(test_f$Crash_Cost + 1)
# Drop the target columns; the predictor matrix below uses the same terms as in training
test_lasso <- test_f[, !names(test_f) %in% c("Crash_Cost", "log_Crash_Cost")]
X_test_lasso <- model.matrix(log_Crash_Cost ~ Bluebook + Income + Single_Parent + House_Worth + Marital_Status + Sex_Male + Education_High_School + Education_Bachelors + Education_Masters + Travel_Time + Car_Purpose_Private + Teen_Driver + Age + Child_count + Job_Tenure + Car_Type_Panel_Truck + Car_Type_Pickup + Car_Type_Sports_Car + Car_Type_SUV + Car_Type_Van + Prev_Claims + Claim_freq + Revoked + Record_Points + Urbanicity_Urban + Job_Clerical + Job_Doctor + Job_Home_Maker + Job_Lawyer + Job_Manager + Job_Professional + Job_Student + Time_in_Force, 
                    data = test_f)[, -1]  # Remove intercept column

# Scale the test data predictors using the scaling parameters from the training set
X_test_lasso <- scale(X_test_lasso, center = attr(X2, "scaled:center"), scale = attr(X2, "scaled:scale"))
# Make predictions using the linear model `la_eq`
predictions_test <- predict(la_eq, newdata = as.data.frame(X_test_lasso))

# Display predictions
head(predictions_test)
##          1          2          3          4          5          6 
## -0.3102340  4.5151068  3.6404360  0.7792394  0.6396287  3.4613205
# Actual values (note: Crash_Cost is in dollars while predictions_test is on the
# log scale, so the MSE below mixes scales and overstates the error)
actual_values <- test_f$Crash_Cost

# Calculate Mean Squared Error (MSE)
mse <- mean((predictions_test - actual_values)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 30705078
# F-statistic from the model summary
f_statistic <- summary(la_eq)$fstatistic[1]
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 148.0358
# R-squared value from the linear model summary
r_squared <- summary(la_eq)$r.squared
cat("R-squared:", r_squared, "\n")
## R-squared: 0.429271

The model explains about 43% of the variability in the training data (no-intercept R-squared = 0.4293), indicating moderate explanatory power, and the F-statistic confirms the model is statistically significant. The very large MSE (30,705,078), however, is partly an artifact of comparing log-scale predictions against dollar-scale actuals; computing the error on a consistent scale (see the sketch below) gives a more interpretable figure, though substantial room for improvement in accuracy and feature selection remains.
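
A minimal sketch of an apples-to-apples error computation, assuming predictions_test and test_f from above:

# Back-transform the log-scale predictions to dollars (inverse of log(x + 1))
pred_dollars <- exp(predictions_test) - 1
# MSE in dollars, and MSE on the log scale the model was actually fit on
mse_dollars <- mean((pred_dollars - test_f$Crash_Cost)^2)
mse_log <- mean((predictions_test - test_f$log_Crash_Cost)^2)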

# Reverse the log transformation (the target was log(Crash_Cost + 1), so subtract 1 after exponentiating)
crash_cost_predictions <- exp(predictions_test) - 1

Some small positive values are predicted even for rows where no crash occurred, so we zero these out rather than include them in the predictions.

# Set predictions to 0 where Crash_Flag is 0
predictions_test[test_f$Crash_Flag == 0] <- 0
# Filter predictions for rows where Crash_Flag is not 0
predictions_non_zero_flag <- predictions_test[test_f$Crash_Flag != 0]

# Plot the histogram
hist(predictions_non_zero_flag, main = "Histogram of Crash Cost Predictions",
     xlab = "Predicted Crash Cost", col = "blue", border = "black", breaks = 30)

Crash occurrence prediction on test set

# Prepare the test set (ensure it matches the training set structure)
test_lasso <- test_f[, !names(test_f) %in% c("Crash_Cost", "log_Crash_Cost")]  # Exclude the target variable and unwanted columns
X_test <- model.matrix(Crash_Flag ~ ., test_lasso)[, -1]  # Convert test data to matrix, remove intercept column

# Use the Lasso model to make predictions on the test set
lasso_pred_test <- predict(lasso_model, newx = X_test, s = "lambda.min", type = "response")

# Convert probabilities to class labels (threshold = 0.5 for classification)
lasso_class_test <- ifelse(lasso_pred_test > 0.5, 1, 0)

# View the confusion matrix for model performance
conf_matrix_test <- table(Predicted = lasso_class_test, Actual = test_f$Crash_Flag)
print(conf_matrix_test)
##          Actual
## Predicted    0    1
##         0 1114  252
##         1   88  179
# Calculate Accuracy, Precision, Recall, F1-Score for the test data
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
precision_test <- conf_matrix_test[2, 2] / sum(conf_matrix_test[2, ])
recall_test <- conf_matrix_test[2, 2] / sum(conf_matrix_test[, 2])
f1_score_test <- 2 * (precision_test * recall_test) / (precision_test + recall_test)

cat("Test Accuracy: ", accuracy_test, "\n")
## Test Accuracy:  0.7917942
cat("Test Precision: ", precision_test, "\n")
## Test Precision:  0.670412
cat("Test Recall: ", recall_test, "\n")
## Test Recall:  0.4153132
cat("Test F1-Score: ", f1_score_test, "\n")
## Test F1-Score:  0.512894
# Classification Error Rate (Error Rate = 1 - Accuracy)
classification_error_rate <- 1 - accuracy_test
cat("Classification Error Rate: ", classification_error_rate, "\n")
## Classification Error Rate:  0.2082058
# Sensitivity (True Positive Rate): TP / (TP + FN), the share of actual 1s correctly flagged
sensitivity_test <- conf_matrix_test[2, 2] / sum(conf_matrix_test[, 2])
cat("Sensitivity: ", sensitivity_test, "\n")
## Sensitivity:  0.4153132
# Specificity (True Negative Rate): TN / (TN + FP), the share of actual 0s correctly identified
specificity_test <- conf_matrix_test[1, 1] / sum(conf_matrix_test[, 1])
cat("Specificity: ", specificity_test, "\n")
## Specificity:  0.9267887
  • Sensitivity (recall): 41.5%. The model correctly identifies 41.5% of the actual crash events.

  • Specificity: 92.7%. The model correctly identifies 92.7% of non-crash events.

  • Precision: 67.0%. Among the predicted crashes, 67.0% are true positives.

  • F1-score: 51.3%. The harmonic mean of precision and recall, providing a balanced measure of performance on an imbalanced outcome.
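
Recall is the weak point at the default 0.5 cutoff. If catching crashes matters more than avoiding false alarms, the classification threshold can be tuned on the ROC curve; a minimal sketch using pROC's Youden-optimal point, assuming lasso_pred_test from above:

# Pick the probability cutoff maximizing sensitivity + specificity (Youden's J)
roc_test <- roc(test_f$Crash_Flag, as.numeric(lasso_pred_test))
coords(roc_test, "best", ret = c("threshold", "sensitivity", "specificity"))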

Conclusion

In conclusion, the project aimed to predict both the occurrence of a crash and the Crash Cost associated with it. For crash occurrence, the Lasso logistic regression model achieved a test-set accuracy of 79.2%, with a precision of 67.0% and a recall of 41.5%; the F1-score was 51.3%. While the model performed reasonably well in terms of overall accuracy, the modest recall leaves clear room for improvement in detecting crash occurrences.

For Crash Cost, the linear regression model provided insights with an R-squared value of 0.43. This indicates that approximately 43% of the variance in crash cost can be explained by the features used in the model. Although the model’s performance is acceptable, the relatively low R-squared suggests that additional features, such as the extent of damage, weather conditions, and other crash-related variables, could significantly improve the prediction of crash costs.

The inclusion of additional variables, like the extent of damage, would provide a more comprehensive understanding of both the likelihood of crashes and their financial implications, leading to more accurate predictions and better decision-making for risk management.