# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 530404 28.4 1180741 63.1 NA 669397 35.8
## Vcells 989609 7.6 8388608 64.0 16384 1851879 14.2
cat("\f") # Clear the console
graphics.off() # Clear all graphs
invisible({
# List of packages to load
packages <- c("reader", # importing data
"psych", # quick summary stats for data exploration
"dplyr",
"tidyr",
"scales",
"glmnet",
"caret",
"ggcorrplot",
"corrplot",
"moments",
"mice", # for imputation of missing values and vis of missing data
"stargazer", # summary stats
"vtable", # summary stats
"summarytools", # summary stats
"naniar", # for visualisation of missing data
"visdat", # for visualisation of missing data
"VIM", # for visualisation of missing data
"DataExplorer", # for visualisation of missing data
"tidyverse", # data manipulation like selecting variables
"fastDummies", # Create dummy variables using fastDummies
"corrplot", # correlation plots
"ggplot2", # graphing
"reshape",
"data.table", # reshape for graphing
"car") # vif for multicollinearity
# Load all packages
lapply(packages, library, character.only = TRUE)
})
stargazer(df_train, type = "text", title = "Summary of df_train", summary = TRUE)
##
## Summary of df
## =======================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------------
## INDEX 6,528 5,156.877 2,986.613 1 10,302
## TARGET_FLAG 6,528 0.264 0.441 0 1
## TARGET_AMT 6,528 1,466.616 4,545.654 0.000 107,586.100
## KIDSDRIV 6,528 0.171 0.509 0 4
## AGE 6,525 44.853 8.651 16 81
## HOMEKIDS 6,528 0.719 1.106 0 5
## YOJ 6,153 10.510 4.104 0 23
## TRAVTIME 6,528 33.442 15.966 5 142
## TIF 6,528 5.367 4.150 1 25
## CLM_FREQ 6,528 0.799 1.159 0 5
## MVR_PTS 6,528 1.700 2.145 0 13
## CAR_AGE 6,129 8.315 5.725 -3 27
## -------------------------------------------------------
# Drop the "INDEX" column from df_train
df_train <- df_train[, !(names(df_train) %in% "INDEX")]
df_test <- df_test[, !(names(df_test) %in% "INDEX")]
# Replace all blank values with NA for easy identification
df_train[df_train == ""] <- NA
df_test[df_test == ""] <- NA
vis_dat(df_train) +
theme(
axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
axis.text.y = element_text(size = 6)
)
# Filter rows where the Job column is NA and select the education column
missing_Jobs_education <- df_train %>%
filter(is.na(JOB)) %>%
select(EDUCATION)
# Display the education levels corresponding to missing Jobs
unique_education_levels <- unique(missing_Jobs_education$EDUCATION)
print(unique_education_levels)
## [1] "PhD" "Masters"
# Analyze Jobs for PhD students
phd_Jobs <- df_train %>%
filter(EDUCATION == "PhD") %>% # Replace with actual label used in your dataset
group_by(JOB) %>% # Group by Job titles
summarise(count = n()) %>% # Count occurrences of each Job
arrange(desc(count)) # Arrange in descending order
# Display the most common Jobs for PhD students
print(phd_Jobs)
## # A tibble: 6 × 2
## JOB count
## <chr> <int>
## 1 Doctor 190
## 2 <NA> 160
## 3 Manager 108
## 4 Lawyer 62
## 5 Home Maker 52
## 6 Professional 11
# Analyze Jobs for Master's students
masters_Jobs <- df_train %>%
filter(EDUCATION == "Masters") %>% # Replace with actual label used in your dataset
group_by(JOB) %>% # Group by Job titles
summarise(count = n()) %>% # Count occurrences of each Job
arrange(desc(count)) # Arrange in descending order
# Display the most common Jobs for Master's students
print(masters_Jobs)
## # A tibble: 6 × 2
## JOB count
## <chr> <int>
## 1 Lawyer 626
## 2 <NA> 259
## 3 Manager 225
## 4 Professional 151
## 5 Home Maker 69
## 6 z_Blue Collar 1
Most PhD students are doctors and most Masters students are lawyers.
df_train <- df_train %>%
mutate(JOB = case_when(
is.na(JOB) & EDUCATION == "PhD" ~ "Lawyer", # Replace NA with "Lawyer" for PhD
is.na(JOB) ~ "Doctor", # Replace remaining NA with "Doctor"
TRUE ~ JOB # Keep existing values if not NA
))
df_test <- df_test %>%
mutate(JOB = case_when(
is.na(JOB) & EDUCATION == "PhD" ~ "Lawyer", # Replace NA with "Lawyer" for PhD
is.na(JOB) ~ "Doctor", # Replace remaining NA with "Doctor"
TRUE ~ JOB # Keep existing values if not NA
))
summary(df_train[, c("INCOME", "HOME_VAL", "AGE", "YOJ", "CAR_AGE")])
## INCOME HOME_VAL AGE YOJ
## Length:6528 Length:6528 Min. :16.00 Min. : 0.00
## Class :character Class :character 1st Qu.:39.00 1st Qu.: 9.00
## Mode :character Mode :character Median :45.00 Median :11.00
## Mean :44.85 Mean :10.51
## 3rd Qu.:51.00 3rd Qu.:13.00
## Max. :81.00 Max. :23.00
## NA's :3 NA's :375
## CAR_AGE
## Min. :-3.000
## 1st Qu.: 1.000
## Median : 8.000
## Mean : 8.315
## 3rd Qu.:12.000
## Max. :27.000
## NA's :399
# Convert Income and House_Worth to numeric, removing any non-numeric characters
df_train <- df_train %>%
mutate(
INCOME = as.integer(gsub("[^0-9.]", "", INCOME)),
HOME_VAL = as.integer(gsub("[^0-9.]", "", HOME_VAL))
)
# Perform the imputation
imputed_data <- mice(df_train, m = 5, method = 'pmm', maxit = 10, seed = 123)
# Complete the dataset with imputed values
df_train <- complete(imputed_data)
# Convert Income and House_Worth to numeric, removing any non-numeric characters
df_test <- df_test %>%
mutate(
INCOME = as.integer(gsub("[^0-9.]", "", INCOME)),
HOME_VAL = as.integer(gsub("[^0-9.]", "", HOME_VAL))
)
# Perform the imputation
imputed_data <- mice(df_test, m = 5, method = 'pmm', maxit = 10, seed = 123)
# Complete the dataset with imputed values
df_test <- complete(imputed_data)
Now the dataset does not have any more missing values
df_train$SEX <- dplyr::recode(df_train$SEX,
"M" = "Male",
"z_F" = "Female"
)
df_train$CAR_TYPE <- dplyr::recode(df_train$CAR_TYPE,
"z_SUV" = "SUV"
)
df_train$EDUCATION <- dplyr::recode(df_train$EDUCATION,
"z_High School" = "High School",
"<High School" = "Before HS"
)
df_train$MSTATUS <- dplyr::recode(df_train$MSTATUS,
"z_No" = "No"
)
df_train$RED_CAR <- dplyr::recode(df_train$RED_CAR,
"yes" = "Yes",
"no" = "No"
)
df_train$URBANICITY <- dplyr::recode(df_train$URBANICITY,
"Highly Urban/ Urban" = "Urban",
"z_Highly Rural/ Rural" = "Rural"
)
df_train$JOB <- dplyr::recode(df_train$JOB,
"z_Blue Collar" = "Blue Collar")
# Renaming the col names for easy read
df_train <- df_train |>
dplyr::rename(Crash_Flag = TARGET_FLAG,
Crash_Cost = TARGET_AMT,
Teen_Driver = KIDSDRIV,
Child_count = HOMEKIDS,
Age = AGE,
Income = INCOME,
Job_Tenure = YOJ,
House_Worth = HOME_VAL,
Marital_Status = MSTATUS,
Sex = SEX,
Job = JOB,
Car_Purpose = CAR_USE,
Bluebook = BLUEBOOK,
Time_in_Force = TIF,
Car_Type = CAR_TYPE,
Red_Car = RED_CAR,
Prev_Claims = OLDCLAIM,
Education = EDUCATION,
Single_Parent = PARENT1,
Revoked = REVOKED,
Record_Points = MVR_PTS,
Car_Age = CAR_AGE,
Urbanicity = URBANICITY,
Claim_freq = CLM_FREQ,
Travel_Time = TRAVTIME
)
# Display the structure of df_train to check data types
str(df_train)
## 'data.frame': 6528 obs. of 25 variables:
## $ Crash_Flag : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Crash_Cost : num 0 0 0 0 0 ...
## $ Teen_Driver : int 0 0 0 0 0 2 0 0 0 0 ...
## $ Age : int 40 48 51 56 51 45 65 56 53 49 ...
## $ Child_count : int 0 0 0 0 0 2 0 0 1 0 ...
## $ Job_Tenure : int 9 0 14 10 9 11 15 13 10 12 ...
## $ Income : int 78658 0 89606 199336 161325 102264 68304 44320 74826 61950 ...
## $ Single_Parent : chr "No" "No" "No" "No" ...
## $ House_Worth : int 236005 75688 288266 501076 429578 0 253131 166535 237354 0 ...
## $ Marital_Status: chr "No" "Yes" "No" "Yes" ...
## $ Sex : chr "Female" "Female" "Female" "Female" ...
## $ Education : chr "Masters" "Masters" "Bachelors" "PhD" ...
## $ Job : chr "Lawyer" "Home Maker" "Clerical" "Lawyer" ...
## $ Travel_Time : int 69 39 32 23 46 14 39 48 51 26 ...
## $ Car_Purpose : chr "Private" "Private" "Private" "Commercial" ...
## $ Bluebook : chr "$17,040" "$1,500" "$10,460" "$41,280" ...
## $ Time_in_Force : int 4 6 1 1 6 3 4 1 10 1 ...
## $ Car_Type : chr "Pickup" "Sports Car" "SUV" "Van" ...
## $ Red_Car : chr "No" "No" "No" "No" ...
## $ Prev_Claims : chr "$0" "$3,867" "$21,581" "$0" ...
## $ Claim_freq : int 0 3 2 0 0 2 0 1 0 0 ...
## $ Revoked : chr "No" "No" "Yes" "No" ...
## $ Record_Points : int 1 5 0 1 0 3 1 7 0 0 ...
## $ Car_Age : int 15 1 8 16 14 7 11 15 9 10 ...
## $ Urbanicity : chr "Rural" "Urban" "Urban" "Urban" ...
df_train$Single_Parent <- as.factor(df_train$Single_Parent)
df_train$Marital_Status <- as.factor(df_train$Marital_Status)
df_train$Sex <- as.factor(df_train$Sex)
df_train$Education <- as.factor(df_train$Education)
df_train$Job <- as.factor(df_train$Job)
df_train$Car_Purpose <- as.factor(df_train$Car_Purpose)
df_train$Bluebook <- as.factor(df_train$Bluebook)
df_train$Prev_Claims <- as.factor(df_train$Prev_Claims)
df_train$Revoked <- as.factor(df_train$Revoked)
df_train$Urbanicity <- as.factor(df_train$Urbanicity)
df_train$Car_Type <- as.factor(df_train$Car_Type)
df_train$Red_Car <- as.factor(df_train$Red_Car)
# Remove non-numeric characters and convert to integer
df_train$Bluebook <- as.integer(gsub("[^0-9]", "", df_train$Bluebook))
df_train$Prev_Claims <- as.integer(gsub("[^0-9]", "", df_train$Prev_Claims))
vis_dat(df_train) +
theme(
axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
axis.text.y = element_text(size = 6)
)
# Converting RED_CAR and REVOKED to binary
df_train$Red_Car <- ifelse(df_train$Red_Car == "Yes", 1, 0)
df_train$Revoked <- ifelse(df_train$Revoked == "Yes", 1, 0)
df_train$Marital_Status <- ifelse(df_train$Marital_Status == "Yes", 1, 0)
df_train$Single_Parent <- ifelse(df_train$Single_Parent == "Yes", 1, 0)
stargazer(df_train, type = "text", title = "Summary of df_train", summary = TRUE)
##
## Summary of df
## ==============================================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------------------------
## Crash_Flag 6,528 0.264 0.441 0 1
## Crash_Cost 6,528 1,466.616 4,545.654 0.000 107,586.100
## Teen_Driver 6,528 0.171 0.509 0 4
## Age 6,528 44.847 8.654 16 81
## Child_count 6,528 0.719 1.106 0 5
## Job_Tenure 6,528 10.502 4.116 0 23
## Income 6,528 61,652.100 47,643.340 0 367,030
## Single_Parent 6,528 0.131 0.338 0 1
## House_Worth 6,528 154,219.100 129,179.100 0 885,282
## Marital_Status 6,528 0.600 0.490 0 1
## Travel_Time 6,528 33.442 15.966 5 142
## Bluebook 6,528 15,641.760 8,381.489 1,500 69,740
## Time_in_Force 6,528 5.367 4.150 1 25
## Red_Car 6,528 0.292 0.455 0 1
## Prev_Claims 6,528 4,119.320 8,924.665 0 57,037
## Claim_freq 6,528 0.799 1.159 0 5
## Revoked 6,528 0.127 0.333 0 1
## Record_Points 6,528 1.700 2.145 0 13
## Car_Age 6,528 8.298 5.742 -3 27
## --------------------------------------------------------------
train_f <- dplyr::select(.data = df_train)
# Check the levels of EDUCATION in the training data
levels(df_train$Education)
## [1] "Bachelors" "Before HS" "High School" "Masters" "PhD"
df_train$Education <- factor(df_train$Education, levels = c("Before HS", "High School", "Bachelors", "Masters", "PhD"))
train_f<- dummy_cols(df_train,
select_columns = c("Education", "Job", "Urbanicity", "Car_Type", "Sex", "Car_Purpose"),
remove_first_dummy = TRUE,
remove_selected_columns = TRUE)
colnames(train_f) <- gsub(" ", "_", colnames(train_f))
stargazer(train_f, type = "text", title = "Summary of df_train", summary = TRUE)
##
## Summary of df
## =====================================================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------------------------------
## Crash_Flag 6,528 0.264 0.441 0 1
## Crash_Cost 6,528 1,466.616 4,545.654 0.000 107,586.100
## Teen_Driver 6,528 0.171 0.509 0 4
## Age 6,528 44.847 8.654 16 81
## Child_count 6,528 0.719 1.106 0 5
## Job_Tenure 6,528 10.502 4.116 0 23
## Income 6,528 61,652.100 47,643.340 0 367,030
## Single_Parent 6,528 0.131 0.338 0 1
## House_Worth 6,528 154,219.100 129,179.100 0 885,282
## Marital_Status 6,528 0.600 0.490 0 1
## Travel_Time 6,528 33.442 15.966 5 142
## Bluebook 6,528 15,641.760 8,381.489 1,500 69,740
## Time_in_Force 6,528 5.367 4.150 1 25
## Red_Car 6,528 0.292 0.455 0 1
## Prev_Claims 6,528 4,119.320 8,924.665 0 57,037
## Claim_freq 6,528 0.799 1.159 0 5
## Revoked 6,528 0.127 0.333 0 1
## Record_Points 6,528 1.700 2.145 0 13
## Car_Age 6,528 8.298 5.742 -3 27
## Education_High_School 6,528 0.285 0.451 0 1
## Education_Bachelors 6,528 0.273 0.445 0 1
## Education_Masters 6,528 0.204 0.403 0 1
## Education_PhD 6,528 0.089 0.285 0 1
## Job_Clerical 6,528 0.161 0.368 0 1
## Job_Doctor 6,528 0.069 0.253 0 1
## Job_Home_Maker 6,528 0.078 0.269 0 1
## Job_Lawyer 6,528 0.130 0.336 0 1
## Job_Manager 6,528 0.116 0.320 0 1
## Job_Professional 6,528 0.136 0.343 0 1
## Job_Student 6,528 0.087 0.282 0 1
## Urbanicity_Urban 6,528 0.794 0.405 0 1
## Car_Type_Panel_Truck 6,528 0.081 0.272 0 1
## Car_Type_Pickup 6,528 0.169 0.375 0 1
## Car_Type_Sports_Car 6,528 0.110 0.312 0 1
## Car_Type_SUV 6,528 0.281 0.449 0 1
## Car_Type_Van 6,528 0.092 0.288 0 1
## Sex_Male 6,528 0.465 0.499 0 1
## Car_Purpose_Private 6,528 0.631 0.483 0 1
## ---------------------------------------------------------------------
summarytools::descr(train_f)
## Descriptive Statistics
## train_f
## N: 6528
##
## Age Bluebook Car_Age Car_Purpose_Private Car_Type_Panel_Truck
## ----------------- --------- ---------- --------- --------------------- ----------------------
## Mean 44.85 15641.76 8.30 0.63 0.08
## Std.Dev 8.65 8381.49 5.74 0.48 0.27
## Min 16.00 1500.00 -3.00 0.00 0.00
## Q1 39.00 9275.00 1.00 0.00 0.00
## Median 45.00 14370.00 8.00 1.00 0.00
## Q3 51.00 20755.00 12.00 1.00 0.00
## Max 81.00 69740.00 27.00 1.00 1.00
## MAD 8.90 8369.28 7.41 0.00 0.00
## IQR 12.00 11475.00 11.00 1.00 0.00
## CV 0.19 0.54 0.69 0.76 3.38
## Skewness -0.04 0.82 0.29 -0.54 3.08
## SE.Skewness 0.03 0.03 0.03 0.03 0.03
## Kurtosis -0.05 0.92 -0.75 -1.70 7.50
## N.Valid 6528.00 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Car_Type_Pickup Car_Type_Sports_Car Car_Type_SUV Car_Type_Van
## ----------------- ----------------- --------------------- -------------- --------------
## Mean 0.17 0.11 0.28 0.09
## Std.Dev 0.38 0.31 0.45 0.29
## Min 0.00 0.00 0.00 0.00
## Q1 0.00 0.00 0.00 0.00
## Median 0.00 0.00 0.00 0.00
## Q3 0.00 0.00 1.00 0.00
## Max 1.00 1.00 1.00 1.00
## MAD 0.00 0.00 0.00 0.00
## IQR 0.00 0.00 1.00 0.00
## CV 2.21 2.85 1.60 3.15
## Skewness 1.76 2.50 0.98 2.83
## SE.Skewness 0.03 0.03 0.03 0.03
## Kurtosis 1.11 4.25 -1.05 6.01
## N.Valid 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Child_count Claim_freq Crash_Cost Crash_Flag Education_Bachelors
## ----------------- ------------- ------------ ------------ ------------ ---------------------
## Mean 0.72 0.80 1466.62 0.26 0.27
## Std.Dev 1.11 1.16 4545.65 0.44 0.45
## Min 0.00 0.00 0.00 0.00 0.00
## Q1 0.00 0.00 0.00 0.00 0.00
## Median 0.00 0.00 0.00 0.00 0.00
## Q3 1.00 2.00 1067.00 1.00 1.00
## Max 5.00 5.00 107586.14 1.00 1.00
## MAD 0.00 0.00 0.00 0.00 0.00
## IQR 1.00 2.00 1066.00 1.00 1.00
## CV 1.54 1.45 3.10 1.67 1.63
## Skewness 1.32 1.21 9.12 1.07 1.02
## SE.Skewness 0.03 0.03 0.03 0.03 0.03
## Kurtosis 0.56 0.31 128.02 -0.85 -0.96
## N.Valid 6528.00 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Education_High_School Education_Masters Education_PhD House_Worth
## ----------------- ----------------------- ------------------- --------------- -------------
## Mean 0.28 0.20 0.09 154219.14
## Std.Dev 0.45 0.40 0.29 129179.07
## Min 0.00 0.00 0.00 0.00
## Q1 0.00 0.00 0.00 0.00
## Median 0.00 0.00 0.00 160482.00
## Q3 1.00 0.00 0.00 237822.50
## Max 1.00 1.00 1.00 885282.00
## MAD 0.00 0.00 0.00 148269.64
## IQR 1.00 0.00 0.00 237819.25
## CV 1.58 1.98 3.19 0.84
## Skewness 0.95 1.47 2.88 0.51
## SE.Skewness 0.03 0.03 0.03 0.03
## Kurtosis -1.09 0.16 6.29 0.05
## N.Valid 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Income Job_Clerical Job_Doctor Job_Home_Maker Job_Lawyer
## ----------------- ----------- -------------- ------------ ---------------- ------------
## Mean 61652.10 0.16 0.07 0.08 0.13
## Std.Dev 47643.34 0.37 0.25 0.27 0.34
## Min 0.00 0.00 0.00 0.00 0.00
## Q1 27661.00 0.00 0.00 0.00 0.00
## Median 53276.00 0.00 0.00 0.00 0.00
## Q3 85637.00 0.00 0.00 0.00 0.00
## Max 367030.00 1.00 1.00 1.00 1.00
## MAD 41807.84 0.00 0.00 0.00 0.00
## IQR 57960.00 0.00 0.00 0.00 0.00
## CV 0.77 2.28 3.68 3.43 2.59
## Skewness 1.22 1.84 3.41 3.14 2.20
## SE.Skewness 0.03 0.03 0.03 0.03 0.03
## Kurtosis 2.28 1.38 9.61 7.83 2.85
## N.Valid 6528.00 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Job_Manager Job_Professional Job_Student Job_Tenure Marital_Status
## ----------------- ------------- ------------------ ------------- ------------ ----------------
## Mean 0.12 0.14 0.09 10.50 0.60
## Std.Dev 0.32 0.34 0.28 4.12 0.49
## Min 0.00 0.00 0.00 0.00 0.00
## Q1 0.00 0.00 0.00 9.00 0.00
## Median 0.00 0.00 0.00 11.00 1.00
## Q3 0.00 0.00 0.00 13.00 1.00
## Max 1.00 1.00 1.00 23.00 1.00
## MAD 0.00 0.00 0.00 2.97 0.00
## IQR 0.00 0.00 0.00 4.00 1.00
## CV 2.76 2.52 3.24 0.39 0.82
## Skewness 2.40 2.12 2.93 -1.19 -0.41
## SE.Skewness 0.03 0.03 0.03 0.03 0.03
## Kurtosis 3.74 2.52 6.59 1.13 -1.83
## N.Valid 6528.00 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Prev_Claims Record_Points Red_Car Revoked Sex_Male Single_Parent
## ----------------- ------------- --------------- --------- --------- ---------- ---------------
## Mean 4119.32 1.70 0.29 0.13 0.46 0.13
## Std.Dev 8924.67 2.14 0.45 0.33 0.50 0.34
## Min 0.00 0.00 0.00 0.00 0.00 0.00
## Q1 0.00 0.00 0.00 0.00 0.00 0.00
## Median 0.00 1.00 0.00 0.00 0.00 0.00
## Q3 4663.50 3.00 1.00 0.00 1.00 0.00
## Max 57037.00 13.00 1.00 1.00 1.00 1.00
## MAD 0.00 1.48 0.00 0.00 0.00 0.00
## IQR 4663.25 3.00 1.00 0.00 1.00 0.00
## CV 2.17 1.26 1.56 2.62 1.07 2.57
## Skewness 3.07 1.33 0.91 2.24 0.14 2.19
## SE.Skewness 0.03 0.03 0.03 0.03 0.03 0.03
## Kurtosis 9.40 1.30 -1.17 3.01 -1.98 2.78
## N.Valid 6528.00 6528.00 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00 100.00 100.00
##
## Table: Table continues below
##
##
##
## Teen_Driver Time_in_Force Travel_Time Urbanicity_Urban
## ----------------- ------------- --------------- ------------- ------------------
## Mean 0.17 5.37 33.44 0.79
## Std.Dev 0.51 4.15 15.97 0.40
## Min 0.00 1.00 5.00 0.00
## Q1 0.00 1.00 22.00 1.00
## Median 0.00 4.00 33.00 1.00
## Q3 0.00 7.00 44.00 1.00
## Max 4.00 25.00 142.00 1.00
## MAD 0.00 4.45 16.31 0.00
## IQR 0.00 6.00 22.00 0.00
## CV 2.98 0.77 0.48 0.51
## Skewness 3.30 0.88 0.46 -1.45
## SE.Skewness 0.03 0.03 0.03 0.03
## Kurtosis 11.23 0.42 0.74 0.11
## N.Valid 6528.00 6528.00 6528.00 6528.00
## Pct.Valid 100.00 100.00 100.00 100.00
As we see here,
Central Tendency:
The Age
column has a mean of 44.85, indicating that
the average age is around mid-40s.
Income
has a mean of 61,652.10, showing the average
income for individuals in the dataset.
Claim_Freq
has a mean of 0.80, implying that most
individuals claim insurance infrequently.
Dispersion (Spread):
The standard deviations (e.g., 8.65 for Age
and
47,643.34 for Income
) highlight how widely the data points
spread around the mean. For Inome
, the spread is quite
large, indicating substantial variation in income levels.
Car_Age
has a relatively smaller standard deviation
(5.74), indicating that car ages are more clustered around the
mean.
Skewness & Kurtosis:
Columns like Claim_Freq
,
Car_Purpose_Private
, and Car_Type_SUV
exhibit
high skewness, indicating that the data are not symmetrically
distributed. For example, Claim_Freq
has a skewness of
1.21, suggesting that most individuals have lower claim frequencies,
with a few having higher frequencies.
Columns like Car_Type_SUV
and
Record_Points
show skewness values greater than 1,
indicating a long right tail. The kurtosis values reflect a higher peak
or more extreme data compared to normal distribution.
Outliers and Extremes:
Claim_Freq
shows a maximum of 5, with some extreme
outliers in the Crash_Cost
column (max value of
107,586.14).
Income
has a high kurtosis (2.28), reflecting some
outliers or a heavy-tailed distribution, particularly with a large range
from 0 to 367,030.
Categorical Data Insights:
Columns like Car_Type_Panel_Truck
,
Education_PhD
, and Job_Clerical
show
relatively low means, implying fewer individuals in these
categories.
Sex_Male
shows a 46% male representation, which is
close to a balanced gender distribution in the dataset.
Some variables show that there are high-end cars and homes that skew the data significantly.
# Shift data to long format
data_long <- melt(df_train)
# Histogram plots of each variable
ggplot(data_long) +
geom_histogram(aes(x = value, y = ..density..), bins = 30, fill = 'lightgray', color = 'black') +
geom_density(aes(x = value), color = 'blue') +
facet_wrap(. ~ variable, scales = 'free', ncol = 5) +
theme_minimal() +
theme(
strip.text = element_text(size = 6), # Adjust size of facet titles
axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
axis.text.y = element_text(size = 5, hjust = 1),
plot.margin = margin(5, 5, 10, 5) # Increase margins to prevent cutting off text
) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::comma) +
labs(title = "Histogram and Density Plots for Each Variable in df_train",
x = "Value",
y = "Density")
ggplot(df_train, aes(x = Sex, fill = factor(Crash_Flag))) +
geom_bar(position = "fill") +
labs(title = "Proportion of Car Crashes by Gender",
x = "Gender",
y = "Proportion",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(labels = scales::percent) +
theme_minimal()
ggplot(df_train, aes(x = factor(Marital_Status), fill = factor(Crash_Flag))) +
geom_bar(position = "fill") +
labs(title = "Proportion of Car Crashes by Marital Status",
x = "Marital Status",
y = "Proportion",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "Not married", "1" = "Married")) + # Change 0 to Single and 1 to Married
theme_minimal()
# Gather Job columns into long format
Jobs_long <- train_f %>%
pivot_longer(cols = starts_with("Job_") & !matches("Job_Tenure"), # Identify Job columns
names_to = "Job_Type", # New column for Job type
values_to = "Is_Job") %>% # Column for Job presence (1 if present)
filter(Is_Job == 1)
ggplot(Jobs_long, aes(x = factor(Job_Type), fill = factor(Crash_Flag))) +
geom_bar(position = "fill") +
labs(title = "Proportion of Car Crashes by Job Type",
x = "Job Type",
y = "Proportion of Car Crashes",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) + # Specify the breaks
scale_x_discrete(labels = c( # Custom x-axis labels, excluding "Job_Tenure"
"Job_Clerical" = "Clerical",
"Job_Doctor" = "Doctor",
"Job_Home_Maker" = "Home Maker",
"Job_Lawyer" = "Lawyer",
"Job_Manager" = "Manager",
"Job_Professional" = "Professional",
"Job_Student" = "Student"
)) +
theme_minimal() +
coord_flip()
ggplot(df_train, aes(x = factor(Car_Type), y = Crash_Cost)) +
geom_boxplot() +
labs(title = "Claim Amount by Car Type",
x = "Car Type",
y = "Claim Amount ($)") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar_format())
ggplot(train_f, aes(x = factor(Urbanicity_Urban), fill = factor(Crash_Flag))) +
geom_bar(position = "fill") +
labs(title = "Proportion of Car Crashes by Urbanicity",
x = "Urbanicity",
y = "Proportion",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(labels = scales::percent) +
scale_x_discrete(labels = c("0" = "Rural", "1" = "Urban")) + # Change 0 to Rural and 1 to Urban
theme_minimal()
# Bucket the AGE variable
train_f$AGE_GROUP <- cut(train_f$Age, breaks = c(0, 25, 45, 65, Inf), labels = c("Young", "Adult", "Middle-Aged", "Senior"))
# Bucket the Income variable
train_f$Income_GROUP <- cut(train_f$Income, breaks = c(-Inf, 30000, 60000, 100000, Inf), labels = c("Low", "Medium", "High", "Very High"))
# Plotting the proportion of car crashes by age bucket
ggplot(train_f, aes(x = AGE_GROUP, fill = factor(Crash_Flag))) +
geom_bar(position = "fill") + # Stacked bar plot showing proportions
labs(title = "Proportion of Car Crashes by Age Bucket",
x = "Age Group",
y = "Proportion of Car Crashes",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(labels = scales::percent) + # Use percentage labels
theme_minimal()
# Plotting the proportion of car crashes by Income bucket
ggplot(train_f, aes(x = Income_GROUP, fill = factor(Crash_Flag))) +
geom_bar(position = "fill") + # Stacked bar plot showing proportions
labs(title = "Proportion of Car Crashes by Income Bucket",
x = "Income Group",
y = "Proportion of Car Crashes",
fill = "Car Crash (1 = Yes, 0 = No)") +
scale_y_continuous(labels = scales::percent) + # Use percentage labels
theme_minimal()
continuous_vars <- train_f[, c("Income", "Crash_Cost", "Bluebook", "Age", "House_Worth", "Prev_Claims")]
corr_matrix <- cor(continuous_vars, use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 45)
# Count occurrences of each Crash_Flag value
Crash_Flag_counts <- table(train_f$Crash_Flag)
# Convert the table to a data frame and create the pie chart
ggplot(as.data.frame(Crash_Flag_counts), aes(x = "", y = Freq, fill = factor(Var1))) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y") +
labs(title = "Proportion of Crashes (Crash Flag)", fill = "Crash Status") +
scale_fill_manual(values = c("skyblue", "lightcoral"),
labels = c("No Crash", "Crash Occurred")) + # Change legend labels
theme_void()
The dataset is heavily imbalanced with lower number of crash occurred instances recorded.
# Dropping columns by name
train_f <- train_f[, !names(train_f) %in% c("Income_GROUP", "AGE_GROUP")]
# Select only numeric columns excluding 'INDEX'
numeric_train <- df_train[sapply(df_train, is.numeric)]
# Remove the 'INDEX' column if it exists
numeric_train <- numeric_train[, !colnames(numeric_train) %in% "INDEX"]
# Calculate the correlation matrix
cor_matrix <- cor(numeric_train, use = "complete.obs")
# Calculate the p-values for significance testing
p.mat <- ggcorrplot::cor_pmat(numeric_train)
# Create the correlation plot
ggcorrplot(corr = cor_matrix,
method = "square", # Shape of the plot
type = "lower", # Display the upper triangle
title = "Correlation Plot", # Title of the plot
colors = c("red", "white", "green"), # Color scheme
lab = TRUE, # Display correlation coefficients
lab_size = 2, # Size of the labels
p.mat = p.mat, # Add p-values for significance
insig = "pch", # Mark insignificant correlations with a symbol
pch = 4, # Use a cross as the symbol
hc.order = TRUE, # Hierarchically cluster the variables
tl.cex = 10, # Adjust size of text labels
tl.col = "black", # Color of text labels
digits = 2) # Number of digits to display
Income
and House_Worth
are strongly
positively correlated which makes sense as they are proportionally
related
Crash_Flag
is positively correlated with
Record_Points
, and Teen_Driver
which may
suggest that people with high number of tickets or underage driving
(most 16 y/o) tend to crash cars more.
Single_Parent
are negatively correlated with
Claim_freq
suggesting they might be careful
drivers.
# Model 1: Multiple Linear Regression with all variables
model1 <- lm(Crash_Cost ~ . -Crash_Flag, data = train_f) # Use all variables as predictors
summary(model1)
##
## Call:
## lm(formula = Crash_Cost ~ . - Crash_Flag, data = train_f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5439 -1646 -728 380 103905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.000e+01 5.016e+02 0.179 0.857594
## Teen_Driver 3.638e+02 1.230e+02 2.957 0.003118 **
## Age 5.352e+00 7.640e+00 0.701 0.483628
## Child_count 5.888e+01 7.124e+01 0.826 0.408565
## Job_Tenure -1.642e+01 1.572e+01 -1.045 0.296172
## Income -4.273e-03 1.923e-03 -2.222 0.026335 *
## Single_Parent 6.539e+02 2.181e+02 2.998 0.002729 **
## House_Worth -3.431e-04 6.288e-04 -0.546 0.585327
## Marital_Status -5.010e+02 1.562e+02 -3.207 0.001350 **
## Travel_Time 1.100e+01 3.469e+00 3.169 0.001536 **
## Bluebook 1.640e-02 9.268e-03 1.769 0.076934 .
## Time_in_Force -4.209e+01 1.315e+01 -3.201 0.001377 **
## Red_Car -1.090e+01 1.607e+02 -0.068 0.945953
## Prev_Claims -1.559e-02 7.854e-03 -1.985 0.047192 *
## Claim_freq 1.767e+02 5.932e+01 2.978 0.002908 **
## Revoked 6.438e+02 1.834e+02 3.511 0.000450 ***
## Record_Points 1.767e+02 2.805e+01 6.300 3.17e-10 ***
## Car_Age -2.031e+01 1.320e+01 -1.539 0.123868
## Education_High_School -1.275e+02 1.841e+02 -0.692 0.488705
## Education_Bachelors -2.925e+02 2.180e+02 -1.342 0.179749
## Education_Masters 2.247e+01 3.177e+02 0.071 0.943634
## Education_PhD 3.261e+01 3.646e+02 0.089 0.928745
## Job_Clerical -2.558e+01 2.055e+02 -0.124 0.900950
## Job_Doctor -5.948e+02 3.437e+02 -1.731 0.083550 .
## Job_Home_Maker -1.554e+02 2.862e+02 -0.543 0.587124
## Job_Lawyer -3.010e+02 3.155e+02 -0.954 0.340141
## Job_Manager -8.948e+02 2.551e+02 -3.508 0.000455 ***
## Job_Professional -2.487e+01 2.301e+02 -0.108 0.913927
## Job_Student -1.944e+02 2.505e+02 -0.776 0.437772
## Urbanicity_Urban 1.618e+03 1.500e+02 10.784 < 2e-16 ***
## Car_Type_Panel_Truck 2.199e+02 2.980e+02 0.738 0.460502
## Car_Type_Pickup 2.949e+02 1.839e+02 1.604 0.108824
## Car_Type_Sports_Car 9.439e+02 2.353e+02 4.012 6.08e-05 ***
## Car_Type_SUV 7.778e+02 1.931e+02 4.027 5.71e-05 ***
## Car_Type_Van 2.812e+02 2.294e+02 1.226 0.220423
## Sex_Male 4.152e+02 1.980e+02 2.097 0.036021 *
## Car_Purpose_Private -8.125e+02 1.722e+02 -4.717 2.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4387 on 6491 degrees of freedom
## Multiple R-squared: 0.07359, Adjusted R-squared: 0.06845
## F-statistic: 14.32 on 36 and 6491 DF, p-value: < 2.2e-16
The R-squared value of 0.07359 indicates that only about 7.4% of the variance in crash costs is explained by the model, which is quite low. This suggests that while some variables like age, marital status, and car type show significant relationships, the model is missing important factors that could better explain crash costs.
# Log transformation of the Crash_Cost column
train_f$log_Crash_Cost <- log(train_f$Crash_Cost + 1)
# Running Multiple Linear Regression with selected impactful variables
model2_refined <- lm(log_Crash_Cost ~ . -Crash_Flag -Crash_Cost,
data = train_f)
summary(model2_refined)
##
## Call:
## lm(formula = log_Crash_Cost ~ . - Crash_Flag - Crash_Cost, data = train_f)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6006 -2.3272 -0.8779 2.1137 10.3352
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.182e+00 3.691e-01 3.203 0.001368 **
## Teen_Driver 5.822e-01 9.055e-02 6.430 1.37e-10 ***
## Age -1.620e-03 5.623e-03 -0.288 0.773296
## Child_count 5.930e-02 5.244e-02 1.131 0.258156
## Job_Tenure -2.935e-02 1.157e-02 -2.536 0.011227 *
## Income -4.436e-06 1.415e-06 -3.134 0.001731 **
## Single_Parent 6.474e-01 1.605e-01 4.033 5.57e-05 ***
## House_Worth -9.006e-07 4.628e-07 -1.946 0.051687 .
## Marital_Status -6.251e-01 1.150e-01 -5.437 5.62e-08 ***
## Travel_Time 1.519e-02 2.554e-03 5.949 2.84e-09 ***
## Bluebook -1.692e-05 6.821e-06 -2.481 0.013134 *
## Time_in_Force -6.520e-02 9.678e-03 -6.737 1.75e-11 ***
## Red_Car -3.063e-02 1.183e-01 -0.259 0.795724
## Prev_Claims -2.125e-05 5.780e-06 -3.676 0.000239 ***
## Claim_freq 2.681e-01 4.366e-02 6.141 8.68e-10 ***
## Revoked 1.311e+00 1.350e-01 9.716 < 2e-16 ***
## Record_Points 1.853e-01 2.064e-02 8.979 < 2e-16 ***
## Car_Age 1.188e-03 9.715e-03 0.122 0.902691
## Education_High_School 3.441e-02 1.355e-01 0.254 0.799538
## Education_Bachelors -5.328e-01 1.605e-01 -3.320 0.000904 ***
## Education_Masters -1.784e-01 2.339e-01 -0.763 0.445671
## Education_PhD -2.713e-01 2.683e-01 -1.011 0.311991
## Job_Clerical -2.478e-03 1.512e-01 -0.016 0.986926
## Job_Doctor -8.287e-01 2.530e-01 -3.276 0.001059 **
## Job_Home_Maker -2.056e-01 2.106e-01 -0.976 0.329095
## Job_Lawyer -5.853e-01 2.322e-01 -2.521 0.011738 *
## Job_Manager -1.183e+00 1.877e-01 -6.303 3.11e-10 ***
## Job_Professional -2.707e-01 1.694e-01 -1.599 0.109959
## Job_Student -1.931e-01 1.843e-01 -1.048 0.294904
## Urbanicity_Urban 2.415e+00 1.104e-01 21.872 < 2e-16 ***
## Car_Type_Panel_Truck 4.082e-01 2.193e-01 1.861 0.062748 .
## Car_Type_Pickup 6.251e-01 1.354e-01 4.618 3.95e-06 ***
## Car_Type_Sports_Car 1.180e+00 1.732e-01 6.813 1.04e-11 ***
## Car_Type_SUV 9.671e-01 1.422e-01 6.803 1.12e-11 ***
## Car_Type_Van 5.661e-01 1.689e-01 3.353 0.000805 ***
## Sex_Male 2.046e-01 1.457e-01 1.404 0.160344
## Car_Purpose_Private -9.968e-01 1.268e-01 -7.863 4.37e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.229 on 6491 degrees of freedom
## Multiple R-squared: 0.2286, Adjusted R-squared: 0.2243
## F-statistic: 53.44 on 36 and 6491 DF, p-value: < 2.2e-16
Performing stepwise selection did not help too.
The updated model with a log-transformed dependent variable
(log_Crash_Cost
) shows an improved R-squared value of
0.2286, meaning that about 23% of the variation in crash costs is
explained by the model. This is a significant improvement compared to
the previous model. Some of the key variables, like
Teen_Driver
, Job_Tenure
, Income
,
and Claim_freq
, show meaningful relationships. For example,
Teen_Driver
has a positive coefficient, meaning that teen
drivers tend to have higher crash costs, while Income
has a
negative coefficient, suggesting that higher income is associated with
lower crash costs.
Since the dependent variable is log-transformed, the coefficients represent the percentage change in the crash cost for a one-unit change in the predictor variables.
While the model fits the data better, there is still room for improvement.
# Define Predictors (X) and Response (y) for your dataset
X2 <- model.matrix(log_Crash_Cost ~ Bluebook + Income + Single_Parent + House_Worth + Marital_Status + Sex_Male + Education_High_School + Education_Bachelors + Education_Masters + Travel_Time + Car_Purpose_Private + Teen_Driver + Age + Child_count + Job_Tenure + Car_Type_Panel_Truck + Car_Type_Pickup + Car_Type_Sports_Car + Car_Type_SUV + Car_Type_Van + Prev_Claims + Claim_freq + Revoked + Record_Points + Urbanicity_Urban + Job_Clerical + Job_Doctor + Job_Home_Maker + Job_Lawyer + Job_Manager + Job_Professional + Job_Student + Time_in_Force,
data = train_f)[, -1] # Remove intercept column
y2 <- train_f$log_Crash_Cost
# Scale the predictor matrix
X2 <- scale(X2)
# Lasso regression with cross-validation to find the optimal lambda
cv_lasso1 <- cv.glmnet(x = X2,
y = y2,
alpha = 1,
family = "gaussian") # Use "gaussian" for continuous target
# Optimal lambda
best_lambda_lasso1 <- cv_lasso1$lambda.min
# Fitting the Lasso model with the optimal lambda
lasso_model1 <- glmnet(X2,
y2,
alpha = 1,
family = "gaussian",
lambda = best_lambda_lasso1)
# Extract coefficients at the optimal lambda
lasso_coefs1 <- coef(lasso_model1, s = best_lambda_lasso1)
# Convert the coefficients into a df
lasso_coefs1_df <- as.data.frame(as.matrix(lasso_coefs1))
colnames(lasso_coefs1_df) <- "Coefficient"
print(lasso_coefs1_df)
## Coefficient
## (Intercept) 2.180892678
## Bluebook -0.142639893
## Income -0.231931214
## Single_Parent 0.217919720
## House_Worth -0.114877469
## Marital_Status -0.304079188
## Sex_Male 0.085193071
## Education_High_School 0.043792316
## Education_Bachelors -0.193517820
## Education_Masters -0.003567308
## Travel_Time 0.239319395
## Car_Purpose_Private -0.482550156
## Teen_Driver 0.294346368
## Age -0.016309811
## Child_count 0.064437539
## Job_Tenure -0.112701677
## Car_Type_Panel_Truck 0.103548911
## Car_Type_Pickup 0.224838050
## Car_Type_Sports_Car 0.355929652
## Car_Type_SUV 0.419783047
## Car_Type_Van 0.155829477
## Prev_Claims -0.180884594
## Claim_freq 0.306377994
## Revoked 0.432073715
## Record_Points 0.396450923
## Urbanicity_Urban 0.970942094
## Job_Clerical 0.005960880
## Job_Doctor -0.230122299
## Job_Home_Maker -0.058256710
## Job_Lawyer -0.222863295
## Job_Manager -0.388368687
## Job_Professional -0.094499605
## Job_Student -0.046058792
## Time_in_Force -0.267902190
W <- as.matrix(coef(lasso_model1))
# Identify predictors with non-zero coefficients, excluding the intercept
keep_X <- rownames(W)[W != 0 & rownames(W) != "(Intercept)"]
# Subset the original data to include only predictors with non-zero coefficients
X_O <- as.matrix(train_f[, keep_X, drop = FALSE]) # Assuming train_f is your dataset
# Combine target variable `y2` with selected predictors into a data frame
df <- cbind.data.frame(y2, X_O)
# Fit a linear model using only the selected predictors, without an intercept
la_eq <- lm(y2 ~ X_O - 1, data = df)
# Output the summary of the linear model
summary(la_eq)
##
## Call:
## lm(formula = y2 ~ X_O - 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6461 -2.3156 -0.8598 2.1120 10.1235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X_OBluebook -1.122e-05 6.576e-06 -1.707 0.087929 .
## X_OIncome -4.190e-06 1.331e-06 -3.149 0.001646 **
## X_OSingle_Parent 7.358e-01 1.581e-01 4.655 3.31e-06 ***
## X_OHouse_Worth -9.637e-07 4.618e-07 -2.087 0.036924 *
## X_OMarital_Status -5.847e-01 1.143e-01 -5.116 3.21e-07 ***
## X_OSex_Male 3.068e-01 1.225e-01 2.505 0.012281 *
## X_OEducation_High_School 2.030e-01 1.168e-01 1.738 0.082205 .
## X_OEducation_Bachelors -3.444e-01 1.235e-01 -2.788 0.005324 **
## X_OEducation_Masters 5.844e-02 1.409e-01 0.415 0.678342
## X_OTravel_Time 1.721e-02 2.469e-03 6.969 3.50e-12 ***
## X_OCar_Purpose_Private -8.984e-01 1.230e-01 -7.306 3.07e-13 ***
## X_OTeen_Driver 5.512e-01 9.005e-02 6.121 9.86e-10 ***
## X_OAge 8.499e-03 4.540e-03 1.872 0.061220 .
## X_OChild_count 9.595e-02 5.119e-02 1.875 0.060887 .
## X_OJob_Tenure -2.277e-02 1.140e-02 -1.997 0.045860 *
## X_OCar_Type_Panel_Truck 4.180e-01 2.188e-01 1.911 0.056107 .
## X_OCar_Type_Pickup 7.311e-01 1.315e-01 5.559 2.82e-08 ***
## X_OCar_Type_Sports_Car 1.307e+00 1.685e-01 7.759 9.87e-15 ***
## X_OCar_Type_SUV 1.104e+00 1.357e-01 8.136 4.85e-16 ***
## X_OCar_Type_Van 6.017e-01 1.685e-01 3.571 0.000358 ***
## X_OPrev_Claims -2.132e-05 5.783e-06 -3.686 0.000230 ***
## X_OClaim_freq 2.681e-01 4.368e-02 6.138 8.83e-10 ***
## X_ORevoked 1.322e+00 1.350e-01 9.794 < 2e-16 ***
## X_ORecord_Points 1.914e-01 2.056e-02 9.309 < 2e-16 ***
## X_OUrbanicity_Urban 2.492e+00 1.075e-01 23.185 < 2e-16 ***
## X_OJob_Clerical 5.514e-02 1.496e-01 0.369 0.712445
## X_OJob_Doctor -9.640e-01 2.176e-01 -4.431 9.53e-06 ***
## X_OJob_Home_Maker -1.622e-01 1.987e-01 -0.816 0.414455
## X_OJob_Lawyer -7.232e-01 1.981e-01 -3.652 0.000263 ***
## X_OJob_Manager -1.280e+00 1.742e-01 -7.352 2.19e-13 ***
## X_OJob_Professional -3.348e-01 1.640e-01 -2.041 0.041273 *
## X_OJob_Student -7.087e-02 1.787e-01 -0.397 0.691665
## X_OTime_in_Force -6.100e-02 9.588e-03 -6.362 2.13e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.231 on 6495 degrees of freedom
## Multiple R-squared: 0.4293, Adjusted R-squared: 0.4264
## F-statistic: 148 on 33 and 6495 DF, p-value: < 2.2e-16
The updated model with a formula excluding the intercept (y2 ~ X_O - 1) results in a higher R-squared value of 0.4293, indicating that approximately 43% of the variance in the dependent variable is explained by the predictors.
Some key variables, such as Income
,
Single_Parent
, Marital_Status
,
Revoked
, and Urbanicity_Urban
, have strong and
statistically significant relationships with the outcome, as indicated
by their low p-values.
For instance,
Single_Parent
has a coeff of 0.7358. This coefficient
suggests that being a single parent increases the log-transformed value
of the dependent variable (y2) by 0.7358 units, holding all other
factors constant. Since the dependent variable is logged, this implies
that a single parent is expected to see an approximately 109.7%
increase in the value of the dependent variable. This could
reflect an economic scenario where single parents may have less access
to support systems or more demanding situations, leading to higher
financial vulnerability.
Urbanicity_Urban
has a coeff of 2.492. The coefficient
suggests that being in an urban area increases the log(crash costs) by
2.492 units, holding all other factors constant. Given that the
dependent variable is logged, this corresponds to an approximate
810.9% increase in crash costs for urban residents
compared to those in rural areas. Urban areas generally have higher
traffic density, more vehicles on the road, and increased likelihood of
accidents due to factors such as congestion, distracted driving, or
complex road networks. Therefore, it makes sense that crash costs would
be significantly higher in urban areas.
The higher R-squared value suggests a better fit compared to the previous model.
# Convert the Crash_Flag to a factor with two levels (binary classification)
train_f$Crash_Flag <- as.factor(train_f$Crash_Flag)
# Model 1: Logistic Regression with all low-correlated variables
logit_model1 <- glm(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Education_Masters -Education_PhD -Sex_Male -Car_Type_SUV -Car_Type_Sports_Car -Job_Manager,
data = train_f, family = binomial)
summary(logit_model1)
##
## Call:
## glm(formula = Crash_Flag ~ . - Crash_Cost - log_Crash_Cost -
## Education_Masters - Education_PhD - Sex_Male - Car_Type_SUV -
## Car_Type_Sports_Car - Job_Manager, family = binomial, data = train_f)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.716e+00 2.931e-01 -5.853 4.82e-09 ***
## Teen_Driver 4.185e-01 6.842e-02 6.116 9.59e-10 ***
## Age -5.904e-04 4.432e-03 -0.133 0.894040
## Child_count 7.521e-02 4.192e-02 1.794 0.072814 .
## Job_Tenure -1.970e-02 9.204e-03 -2.141 0.032306 *
## Income -5.530e-06 1.134e-06 -4.875 1.09e-06 ***
## Single_Parent 3.645e-01 1.216e-01 2.997 0.002723 **
## House_Worth -9.401e-07 3.744e-07 -2.511 0.012027 *
## Marital_Status -5.022e-01 9.260e-02 -5.423 5.86e-08 ***
## Travel_Time 1.403e-02 2.076e-03 6.757 1.41e-11 ***
## Bluebook -3.407e-05 5.284e-06 -6.448 1.14e-10 ***
## Time_in_Force -5.579e-02 8.153e-03 -6.843 7.75e-12 ***
## Red_Car -2.408e-01 7.741e-02 -3.111 0.001865 **
## Prev_Claims -1.511e-05 4.224e-06 -3.578 0.000346 ***
## Claim_freq 2.087e-01 3.144e-02 6.640 3.15e-11 ***
## Revoked 9.465e-01 9.883e-02 9.577 < 2e-16 ***
## Record_Points 1.198e-01 1.513e-02 7.919 2.40e-15 ***
## Car_Age -1.073e-02 7.094e-03 -1.513 0.130397
## Education_High_School 8.325e-02 9.230e-02 0.902 0.367129
## Education_Bachelors -3.294e-01 9.676e-02 -3.404 0.000664 ***
## Job_Clerical 3.232e-01 1.092e-01 2.961 0.003067 **
## Job_Doctor -1.821e-01 1.615e-01 -1.128 0.259450
## Job_Home_Maker 1.747e-01 1.529e-01 1.143 0.253166
## Job_Lawyer 3.741e-02 1.337e-01 0.280 0.779713
## Job_Professional 1.462e-01 1.138e-01 1.284 0.199155
## Job_Student 2.531e-03 1.399e-01 0.018 0.985572
## Urbanicity_Urban 2.293e+00 1.257e-01 18.245 < 2e-16 ***
## Car_Type_Panel_Truck 1.796e-01 1.645e-01 1.092 0.274823
## Car_Type_Pickup 3.943e-02 9.558e-02 0.413 0.679948
## Car_Type_Van 2.051e-01 1.296e-01 1.582 0.113679
## Car_Purpose_Private -9.778e-01 9.113e-02 -10.729 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7533.1 on 6527 degrees of freedom
## Residual deviance: 5933.7 on 6497 degrees of freedom
## AIC: 5995.7
##
## Number of Fisher Scoring iterations: 5
# Calculate VIF
vif_values <- vif(logit_model1)
print(vif_values)
## Teen_Driver Age Child_count
## 1.351621 1.445736 2.195577
## Job_Tenure Income Single_Parent
## 1.448771 2.300350 1.934538
## House_Worth Marital_Status Travel_Time
## 1.915093 2.053758 1.035641
## Bluebook Time_in_Force Red_Car
## 1.716980 1.011532 1.201895
## Prev_Claims Claim_freq Revoked
## 1.633909 1.458158 1.305634
## Record_Points Car_Age Education_High_School
## 1.161424 1.570936 1.782192
## Education_Bachelors Job_Clerical Job_Doctor
## 1.732772 1.589933 1.453315
## Job_Home_Maker Job_Lawyer Job_Professional
## 1.644199 1.802142 1.374693
## Job_Student Urbanicity_Urban Car_Type_Panel_Truck
## 1.653085 1.137673 2.012575
## Car_Type_Pickup Car_Type_Van Car_Purpose_Private
## 1.334408 1.386032 1.960249
Teen_Driver
Coefficient: 0.4185
Interpretation: The coefficient suggests that being a teen driver increases the log-odds of having a crash by 0.4185 units. This corresponds to an approximately 51.8% higher odds of having a crash compared to non-teen drivers.
Economic Interpretation: Teen drivers are often considered higher risk due to inexperience, which justifies the higher odds of accidents.
Income
Coefficient: -5.530e-06
Interpretation: For each unit increase in income, the log-odds of having a crash decrease by 0.000005530. This suggests that higher income decreases the odds of a crash, but the effect is quite small.
Economic Interpretation: Higher-income individuals may have better vehicles, safer driving habits, or more resources to prevent accidents, leading to a lower likelihood of crashes.
Single_Parent
Coefficient: 0.3645
Interpretation: Being a single parent increases the log-odds of a crash by 0.3645 units, which corresponds to an approximately 43.9% increase in the odds of having a crash.
Economic Interpretation: Single parents may have additional driving stressors, such as transporting children, which could lead to a higher risk of accidents.
Travel_Time
Coefficient: 0.01403
Interpretation: For each additional unit increase in travel time, the log-odds of having a crash increase by 0.01403. This corresponds to a 1.4% increase in odds for each additional time unit of travel.
Economic Interpretation: Longer travel times often correlate with increased exposure to driving risk, such as more time spent on the road and more opportunities for accidents.
Urbanicity_Urban
Coefficient: 2.293
Interpretation: Living in an urban area increases the log-odds of a crash by 2.293 units. This corresponds to a approximate 9.91 times increase in the odds of having a crash compared to rural areas.
Economic Interpretation: Urban areas generally have higher traffic density, congestion, and more driving risks, justifying the much higher odds of accidents in urban settings.
Car_Purpose_Private
Coefficient: -0.9778
Interpretation: If the car is used for private purposes, the log-odds of having a crash decrease by 0.9778 units. This corresponds to about a 62.5% decrease in the odds of a crash compared to cars used for business purposes.
Economic Interpretation: Private use vehicles may be driven less frequently or under safer conditions compared to vehicles used for business purposes, reducing the crash odds.
train_f_lasso <- train_f[, !names(train_f) %in% c("Crash_Cost", "log_Crash_Cost")] # Exclude response and unwanted variables
X <- model.matrix(Crash_Flag ~ ., train_f_lasso)[, -1] # Convert to matrix and remove intercept column
y <- train_f$Crash_Flag # Response variable
# Set a range of lambda values for cross-validation
lambda_grid <- 10^seq(3, -3, length = 100)
# Fit the Lasso model using glmnet
lasso_model <- cv.glmnet(X, y, alpha = 1, lambda = lambda_grid, family = "binomial")
# View the best lambda value
lasso_model$lambda.min # Best lambda based on cross-validation
## [1] 0.001
# Plot the cross-validation curve
plot(lasso_model)
# Get the coefficients for the best lambda
coef(lasso_model, s = "lambda.min")
## 37 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -2.348478e+00
## Teen_Driver 4.303671e-01
## Age -1.618479e-03
## Child_count 4.656802e-02
## Job_Tenure -1.703011e-02
## Income -4.143862e-06
## Single_Parent 3.884096e-01
## House_Worth -8.750704e-07
## Marital_Status -4.946172e-01
## Travel_Time 1.280118e-02
## Bluebook -1.915300e-05
## Time_in_Force -5.429347e-02
## Red_Car .
## Prev_Claims -1.313599e-05
## Claim_freq 1.932873e-01
## Revoked 8.979760e-01
## Record_Points 1.135534e-01
## Car_Age .
## Education_High_School .
## Education_Bachelors -4.008179e-01
## Education_Masters -1.974289e-01
## Education_PhD -2.578841e-01
## Job_Clerical 8.082546e-02
## Job_Doctor -3.877766e-01
## Job_Home_Maker -6.092110e-02
## Job_Lawyer -2.087108e-01
## Job_Manager -7.663598e-01
## Job_Professional -1.062664e-01
## Job_Student -8.845932e-02
## Urbanicity_Urban 2.294023e+00
## Car_Type_Panel_Truck 3.859180e-01
## Car_Type_Pickup 4.740732e-01
## Car_Type_Sports_Car 8.930570e-01
## Car_Type_SUV 7.182656e-01
## Car_Type_Van 4.841622e-01
## Sex_Male 8.549020e-02
## Car_Purpose_Private -8.087533e-01
# Predict on the training data using the best lambda
lasso_pred <- predict(lasso_model, X, s = "lambda.min", type = "response")
Here are the key factors with significant effects on crash likelihood:
Teen_Driver (0.43): Teen drivers have a significantly higher likelihood of a crash, increasing the odds by about 43%.
Single_Parent (0.39): Single parents are more likely to be involved in crashes, with a significant positive effect.
Income (-4.14e-06): Higher income slightly decreases the likelihood of a crash.
Marital_Status (-0.49): Married individuals have a lower likelihood of being involved in a crash.
Travel_Time (0.013): Longer travel times significantly increase the likelihood of a crash.
Prev_Claims (-0.000013): Previous claims slightly reduce the likelihood of a future crash.
Revoked (0.90): Having a revoked license significantly increases the likelihood of a crash.
Urbanicity_Urban (2.29): Living in urban areas drastically increases the likelihood of a crash.
# Evaluate model performance (e.g., AUC, accuracy, etc.)
library(pROC)
roc_curve <- roc(y, lasso_pred)
plot(roc_curve, main = "ROC Curve for Lasso Model")
auc(roc_curve) # AUC value
## Area under the curve: 0.8146
# You can also calculate accuracy, confusion matrix, etc.
lasso_class <- ifelse(lasso_pred > 0.5, 1, 0)
table(Predicted = lasso_class, Actual = y) # Confusion matrix
## Actual
## Predicted 0 1
## 0 4457 998
## 1 349 724
# Calculate the confusion matrix from the predicted values
conf_matrix <- table(Predicted = lasso_class, Actual = y)
# Accuracy: Proportion of correct predictions
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.7936581
# Precision: Proportion of positive predictions that are correct
precision <- conf_matrix[2, 2] / sum(conf_matrix[2, ])
cat("Precision: ", precision, "\n")
## Precision: 0.6747437
# Recall: Proportion of actual positives that are correctly identified
recall <- conf_matrix[2, 2] / sum(conf_matrix[, 2])
cat("Recall: ", recall, "\n")
## Recall: 0.4204413
# F1-Score: Harmonic mean of precision and recall
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1-Score: ", f1_score, "\n")
## F1-Score: 0.518068
AUC (0.8146): This suggests that the model is performing reasonably well, as values above 0.8 typically indicate good discrimination between the two classes. However, there may still be room for improvement.
Confusion Matrix:
The model is predicting 998 false positives (cases where it predicted 1 but the actual value is 0), which is relatively high compared to the number of true positives (724). This suggests the model may be a bit overconfident in predicting 1s.
The number of false negatives (349) is also notable; improving the model might reduce this count (i.e., correctly identifying more of the 1s).
Sparse Coefficients:
Red_Car
,
Car_Age
, Education_High_School
, etc., were not
deemed important by the Lasso model.# Log Transformations of potentially skewed variables
train_f$log_Income <- log(train_f$Income + 1)
train_f$log_Age <- log(train_f$Age + 1)
train_f$log_House_Worth <- log(train_f$House_Worth + 1)
# Fit the Logistic Regression model with log-transformed variables
logit_model3 <- glm(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Income -Age -House_Worth + log_Income + log_Age + log_House_Worth,
data = train_f,
family = binomial)
# Summarize the model
summary(logit_model3)
##
## Call:
## glm(formula = Crash_Flag ~ . - Crash_Cost - log_Crash_Cost -
## Income - Age - House_Worth + log_Income + log_Age + log_House_Worth,
## family = binomial, data = train_f)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.324e-01 7.838e-01 -0.807 0.419754
## Teen_Driver 4.662e-01 6.993e-02 6.667 2.61e-11 ***
## Child_count 1.141e-02 4.283e-02 0.266 0.789910
## Job_Tenure 7.090e-03 1.158e-02 0.612 0.540495
## Single_Parent 3.780e-01 1.233e-01 3.066 0.002167 **
## Marital_Status -4.633e-01 9.621e-02 -4.816 1.47e-06 ***
## Travel_Time 1.332e-02 2.110e-03 6.312 2.75e-10 ***
## Bluebook -2.092e-05 5.826e-06 -3.591 0.000330 ***
## Time_in_Force -5.600e-02 8.233e-03 -6.803 1.03e-11 ***
## Red_Car -2.122e-02 9.649e-02 -0.220 0.825963
## Prev_Claims -1.538e-05 4.294e-06 -3.583 0.000340 ***
## Claim_freq 2.058e-01 3.181e-02 6.471 9.71e-11 ***
## Revoked 9.349e-01 1.003e-01 9.322 < 2e-16 ***
## Record_Points 1.141e-01 1.532e-02 7.448 9.47e-14 ***
## Car_Age 1.528e-03 8.058e-03 0.190 0.849599
## Education_High_School -2.106e-02 1.061e-01 -0.199 0.842653
## Education_Bachelors -4.902e-01 1.271e-01 -3.858 0.000114 ***
## Education_Masters -3.086e-01 1.946e-01 -1.586 0.112837
## Education_PhD -5.371e-01 2.219e-01 -2.421 0.015476 *
## Job_Clerical 9.212e-02 1.173e-01 0.785 0.432169
## Job_Doctor -5.457e-01 2.109e-01 -2.588 0.009663 **
## Job_Home_Maker -2.718e-01 1.828e-01 -1.487 0.137037
## Job_Lawyer -3.307e-01 1.949e-01 -1.697 0.089678 .
## Job_Manager -8.898e-01 1.582e-01 -5.624 1.87e-08 ***
## Job_Professional -2.060e-01 1.347e-01 -1.529 0.126162
## Job_Student -4.262e-01 1.627e-01 -2.620 0.008792 **
## Urbanicity_Urban 2.384e+00 1.261e-01 18.895 < 2e-16 ***
## Car_Type_Panel_Truck 4.656e-01 1.801e-01 2.585 0.009725 **
## Car_Type_Pickup 5.811e-01 1.120e-01 5.186 2.14e-07 ***
## Car_Type_Sports_Car 1.030e+00 1.458e-01 7.063 1.62e-12 ***
## Car_Type_SUV 8.412e-01 1.244e-01 6.764 1.34e-11 ***
## Car_Type_Van 5.541e-01 1.409e-01 3.931 8.44e-05 ***
## Sex_Male 1.610e-01 1.249e-01 1.289 0.197426
## Car_Purpose_Private -7.762e-01 9.985e-02 -7.774 7.62e-15 ***
## log_Income -8.485e-02 1.906e-02 -4.452 8.50e-06 ***
## log_Age -3.703e-01 1.962e-01 -1.887 0.059114 .
## log_House_Worth -2.563e-02 7.521e-03 -3.408 0.000654 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7533.1 on 6527 degrees of freedom
## Residual deviance: 5813.3 on 6491 degrees of freedom
## AIC: 5887.3
##
## Number of Fisher Scoring iterations: 5
Income: The odds ratio of 0.919 suggests that for each 1% increase in income, the odds of having a crash decrease by about 8.1%. Higher income is associated with a lower likelihood of being involved in a crash. Specifically, for every 1% increase in income, the odds of having a crash decrease by about 8.1%. This could imply that individuals with higher income might be less likely to engage in risky driving behaviors, or may have better access to safer vehicles and driving conditions.
Teen_Driver and Single_Parent both significantly increase the likelihood of a crash, with odds approximately 1.6 times and 1.46 times higher, respectively.
Marital_Status has a protective effect, with married individuals having about 37% lower odds of a crash.
Travel_Time and Claim_freq are both positively related to crashes, with small but significant increases in the odds for each additional minute of travel and each additional claim in the past.
log_Age: A 1% increase in age reduces the odds of a crash by about 30.2%.
log_House_Worth: A 1% increase in house worth reduces the odds of a crash by about 2.5%.
vif(logit_model3)
## Teen_Driver Child_count Job_Tenure
## 1.376513 2.234487 2.253563
## Single_Parent Marital_Status Travel_Time
## 1.919129 2.161893 1.036463
## Bluebook Time_in_Force Red_Car
## 2.117413 1.010912 1.826999
## Prev_Claims Claim_freq Revoked
## 1.625181 1.453600 1.304097
## Record_Points Car_Age Education_High_School
## 1.162441 1.997002 2.295532
## Education_Bachelors Education_Masters Education_PhD
## 2.900143 5.507244 3.187359
## Job_Clerical Job_Doctor Job_Home_Maker
## 1.807290 2.524364 2.262587
## Job_Lawyer Job_Manager Job_Professional
## 3.829665 1.839548 1.915930
## Job_Student Urbanicity_Urban Car_Type_Panel_Truck
## 2.185151 1.145092 2.414511
## Car_Type_Pickup Car_Type_Sports_Car Car_Type_SUV
## 1.822483 2.142494 3.069440
## Car_Type_Van Sex_Male Car_Purpose_Private
## 1.639218 3.662596 2.304570
## log_Income log_Age log_House_Worth
## 3.572014 1.508095 1.780199
# Perform 10-fold cross-validation
train_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Train the model with cross-validation
cv_model3 <- train(Crash_Flag ~ . -Crash_Cost -log_Crash_Cost -Income -Age -House_Worth + log_Income + log_Age + log_House_Worth,
data = train_f,
method = "glm",
family = "binomial",
trControl = train_control)
# Summary of the cross-validation results
print(cv_model3)
## Generalized Linear Model
##
## 6528 samples
## 41 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5875, 5875, 5875, 5875, 5876, 5875, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7882978 0.3824096
# Make predictions on the training data (after cross-validation)
lasso_pred3 <- predict(cv_model3, newdata = train_f, type = "prob")
# Extract probabilities for the positive class (class 1)
lasso_pred3_class <- lasso_pred3[, 2]
# Predict the class (1 if probability > 0.5, else 0)
lasso_class3 <- ifelse(lasso_pred3_class > 0.5, 1, 0)
# Confusion Matrix for model 3
conf_matrix3 <- table(Predicted = lasso_class3, Actual = train_f$Crash_Flag)
# Calculate accuracy, precision, recall, and F1-score for model 3
accuracy3 <- sum(diag(conf_matrix3)) / sum(conf_matrix3)
precision3 <- conf_matrix3[2, 2] / sum(conf_matrix3[2, ])
recall3 <- conf_matrix3[2, 2] / sum(conf_matrix3[, 2])
f1_score3 <- 2 * (precision3 * recall3) / (precision3 + recall3)
cat("Model 3 Accuracy: ", accuracy3, "\n")
## Model 3 Accuracy: 0.7930453
cat("Model 3 Precision: ", precision3, "\n")
## Model 3 Precision: 0.6697164
cat("Model 3 Recall: ", recall3, "\n")
## Model 3 Recall: 0.4250871
cat("Model 3 F1-Score: ", f1_score3, "\n")
## Model 3 F1-Score: 0.520071
We see that this model give a slightly higher accuracy. So we will proceed with this.
# Exclude the response and unwanted variables
train_f_ridge <- train_f[, !names(train_f) %in% c("Crash_Cost", "log_Crash_Cost")] # Exclude response and unwanted variables
X <- model.matrix(Crash_Flag ~ ., train_f_ridge)[, -1] # Convert to matrix and remove intercept column
y <- train_f$Crash_Flag # Response variable
# Set a range of lambda values for cross-validation
lambda_grid <- 10^seq(3, -3, length = 100)
# Fit the Ridge model using glmnet (alpha = 0 for Ridge)
ridge_model <- cv.glmnet(X, y, alpha = 0, lambda = lambda_grid, family = "binomial")
# View the best lambda value
ridge_model$lambda.min # Best lambda based on cross-validation
## [1] 0.001
# Plot the cross-validation curve
plot(ridge_model)
# Get the coefficients for the best lambda
coef(ridge_model, s = "lambda.min")
## 40 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 9.398596e+00
## Teen_Driver 4.930501e-01
## Age 8.662841e-02
## Child_count 2.305280e-02
## Job_Tenure -2.077821e-03
## Income -4.079642e-06
## Single_Parent 3.365570e-01
## House_Worth 4.461192e-07
## Marital_Status -4.834584e-01
## Travel_Time 1.307039e-02
## Bluebook -2.052077e-05
## Time_in_Force -5.595448e-02
## Red_Car -4.182434e-02
## Prev_Claims -1.464760e-05
## Claim_freq 2.010904e-01
## Revoked 9.227511e-01
## Record_Points 1.098507e-01
## Car_Age 2.392477e-03
## Education_High_School -6.204068e-03
## Education_Bachelors -4.347815e-01
## Education_Masters -2.351675e-01
## Education_PhD -3.622559e-01
## Job_Clerical 4.839124e-02
## Job_Doctor -5.106872e-01
## Job_Home_Maker -3.079144e-01
## Job_Lawyer -3.007698e-01
## Job_Manager -8.590562e-01
## Job_Professional -1.870325e-01
## Job_Student -4.741720e-01
## Urbanicity_Urban 2.339567e+00
## Car_Type_Panel_Truck 5.150674e-01
## Car_Type_Pickup 5.668864e-01
## Car_Type_Sports_Car 9.293479e-01
## Car_Type_SUV 7.678674e-01
## Car_Type_Van 5.812758e-01
## Sex_Male 1.037209e-01
## Car_Purpose_Private -7.818775e-01
## log_Income -5.341475e-02
## log_Age -4.002900e+00
## log_House_Worth -3.255051e-02
# Predict on the training data using the best lambda
ridge_pred <- predict(ridge_model, X, s = "lambda.min", type = "response")
# Evaluate model performance (e.g., AUC, accuracy, etc.)
library(pROC)
roc_curve_ridge <- roc(y, ridge_pred)
plot(roc_curve_ridge, main = "ROC Curve for Ridge Model")
auc(roc_curve_ridge) # AUC value
## Area under the curve: 0.818
# You can also calculate accuracy, confusion matrix, etc.
ridge_class <- ifelse(ridge_pred > 0.5, 1, 0)
table(Predicted = ridge_class, Actual = y) # Confusion matrix
## Actual
## Predicted 0 1
## 0 4447 988
## 1 359 734
# Calculate the confusion matrix from the predicted values
conf_matrix_ridge <- table(Predicted = ridge_class, Actual = y)
# Accuracy: Proportion of correct predictions
accuracy_ridge <- sum(diag(conf_matrix_ridge)) / sum(conf_matrix_ridge)
cat("Accuracy (Ridge): ", accuracy_ridge, "\n")
## Accuracy (Ridge): 0.7936581
# Precision: Proportion of positive predictions that are correct
precision_ridge <- conf_matrix_ridge[2, 2] / sum(conf_matrix_ridge[2, ])
cat("Precision (Ridge): ", precision_ridge, "\n")
## Precision (Ridge): 0.6715462
# Recall: Proportion of actual positives that are correctly identified
recall_ridge <- conf_matrix_ridge[2, 2] / sum(conf_matrix_ridge[, 2])
cat("Recall (Ridge): ", recall_ridge, "\n")
## Recall (Ridge): 0.4262485
# F1-Score: Harmonic mean of precision and recall
f1_score_ridge <- 2 * (precision_ridge * recall_ridge) / (precision_ridge + recall_ridge)
cat("F1-Score (Ridge): ", f1_score_ridge, "\n")
## F1-Score (Ridge): 0.521492
# Predicting on the training data using la_eq model
pred_train <- predict(la_eq, newdata = train_f)
# Calculate Mean Squared Error (MSE)
mse <- mean((train_f$log_Crash_Cost - pred_train)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 10.38533
# Calculate R-squared
rss <- sum((train_f$log_Crash_Cost - pred_train)^2)
tss <- sum((train_f$log_Crash_Cost - mean(train_f$log_Crash_Cost))^2)
r_squared <- 1 - rss/tss
cat("R-squared:", r_squared, "\n")
## R-squared: 0.2272999
# Calculate Adjusted R-squared
n <- nrow(train_f) # number of observations
p <- ncol(train_f) # number of predictors
adjusted_r_squared <- 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
cat("Adjusted R-squared:", adjusted_r_squared, "\n")
## Adjusted R-squared: 0.2222955
# Calculate F-statistic
f_statistic <- (r_squared / (p - 1)) / ((1 - r_squared) / (n - p))
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 46.53517
# Residuals vs Fitted Values plot
plot(pred_train, train_f$log_Crash_Cost - pred_train,
main = "Residuals vs Fitted Values",
xlab = "Fitted Values",
ylab = "Residuals")
abline(h = 0, col = "red")
# Normal Q-Q Plot to check if residuals are normally distributed
qqnorm(train_f$log_Crash_Cost - pred_train)
qqline(train_f$log_Crash_Cost - pred_train, col = "red")
# Scale-Location plot
plot(pred_train, sqrt(abs(train_f$log_Crash_Cost - pred_train)),
main = "Scale-Location Plot",
xlab = "Fitted Values",
ylab = "Sqrt of |Residuals|")
abline(h = 0, col = "red")
# Residuals vs Leverage plot
plot(hatvalues(la_eq), train_f$log_Crash_Cost - pred_train,
main = "Residuals vs Leverage",
xlab = "Leverage",
ylab = "Residuals")
abline(h = 0, col = "red")
Discrete-Like Distribution: Crash_Cost
values are clustered or concentrated around specific points (rather than
exhibiting a smooth continuous distribution), the model will have
difficulty predicting variations in this variable. It lacks meaningful
variation which high values of crash cost = 0 due to the imbalanced
nature of the dataset.
df_test$SEX <- dplyr::recode(df_test$SEX,
"M" = "Male",
"z_F" = "Female"
)
df_test$CAR_TYPE <- dplyr::recode(df_test$CAR_TYPE,
"z_SUV" = "SUV"
)
df_test$EDUCATION <- dplyr::recode(df_test$EDUCATION,
"z_High School" = "High School",
"<High School" = "Before HS"
)
df_test$MSTATUS <- dplyr::recode(df_test$MSTATUS,
"z_No" = "No"
)
df_test$RED_CAR <- dplyr::recode(df_test$RED_CAR,
"yes" = "Yes",
"no" = "No"
)
df_test$URBANICITY <- dplyr::recode(df_test$URBANICITY,
"Highly Urban/ Urban" = "Urban",
"z_Highly Rural/ Rural" = "Rural"
)
df_test$JOB <- dplyr::recode(df_test$JOB,
"z_Blue Collar" = "Blue Collar")
# Renaming the col names for easy read
df_test <- df_test |>
dplyr::rename(Crash_Flag = TARGET_FLAG,
Crash_Cost = TARGET_AMT,
Teen_Driver = KIDSDRIV,
Child_count = HOMEKIDS,
Income = INCOME,
Job_Tenure = YOJ,
House_Worth = HOME_VAL,
Marital_Status = MSTATUS,
Sex = SEX,
Job = JOB,
Age = AGE,
Car_Purpose = CAR_USE,
Bluebook = BLUEBOOK,
Time_in_Force = TIF,
Car_Type = CAR_TYPE,
Red_Car= RED_CAR,
Prev_Claims = OLDCLAIM,
Education = EDUCATION,
Single_Parent = PARENT1,
Revoked = REVOKED,
Record_Points = MVR_PTS,
Car_Age = CAR_AGE,
Urbanicity = URBANICITY,
Claim_freq = CLM_FREQ,
Travel_Time = TRAVTIME
)
df_test$Single_Parent <- as.factor(df_test$Single_Parent)
df_test$Marital_Status <- as.factor(df_test$Marital_Status)
df_test$Sex <- as.factor(df_test$Sex)
df_test$Education <- as.factor(df_test$Education)
df_test$Job <- as.factor(df_test$Job)
df_test$Car_Purpose <- as.factor(df_test$Car_Purpose)
df_test$Bluebook <- as.factor(df_test$Bluebook)
df_test$Prev_Claims <- as.factor(df_test$Prev_Claims)
df_test$Revoked <- as.factor(df_test$Revoked)
df_test$Urbanicity <- as.factor(df_test$Urbanicity)
df_test$Car_Type <- as.factor(df_test$Car_Type)
df_test$Red_Car <- as.factor(df_test$Red_Car)
# Remove non-numeric characters and convert to integer
df_test$Bluebook <- as.integer(gsub("[^0-9]", "", df_test$Bluebook))
df_test$Prev_Claims <- as.integer(gsub("[^0-9]", "", df_test$Prev_Claims))
vis_dat(df_test) +
theme(
axis.text.x = element_text(size = 6, angle = 90, hjust = 1),
axis.text.y = element_text(size = 6)
)
# Converting RED_CAR and REVOKED to binary
df_test$Red_Car <- ifelse(df_test$Red_Car == "Yes", 1, 0)
df_test$Revoked <- ifelse(df_test$Revoked == "Yes", 1, 0)
df_test$Marital_Status <- ifelse(df_test$Marital_Status == "Yes", 1, 0)
df_test$Single_Parent <- ifelse(df_test$Single_Parent == "Yes", 1, 0)
# Check the levels of Education in the training data
levels(df_test$Education)
## [1] "Bachelors" "Before HS" "High School" "Masters" "PhD"
df_test$Education <- factor(df_test$Education, levels = c("Before HS", "High School", "Bachelors", "Masters", "PhD"))
test_f<- dummy_cols(df_test,
select_columns = c("Education", "Job", "Urbanicity", "Car_Type", "Sex", "Car_Purpose"),
remove_first_dummy = TRUE,
remove_selected_columns = TRUE)
colnames(test_f) <- gsub(" ", "_", colnames(test_f))
stargazer(test_f, type = "text", title = "Summary of df_test", summary = TRUE)
##
## Summary of df
## ====================================================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------------------------------
## Crash_Flag 1,633 0.264 0.441 0 1
## Crash_Cost 1,633 1,655.067 5,288.826 0.000 73,783.470
## Teen_Driver 1,633 0.172 0.523 0 4
## Age 1,633 44.524 8.535 16 72
## Child_count 1,633 0.731 1.156 0 5
## Job_Tenure 1,633 10.426 4.056 0 19
## Income 1,633 62,536.720 47,106.420 0 290,846
## Single_Parent 1,633 0.135 0.342 0 1
## House_Worth 1,633 158,398.200 130,443.400 0 682,634
## Marital_Status 1,633 0.596 0.491 0 1
## Travel_Time 1,633 33.659 15.679 5 113
## Bluebook 1,633 15,982.280 8,568.104 1,500 50,970
## Time_in_Force 1,633 5.289 4.134 1 22
## Red_Car 1,633 0.288 0.453 0 1
## Prev_Claims 1,633 3,708.301 8,155.113 0 53,477
## Claim_freq 1,633 0.798 1.158 0 5
## Revoked 1,633 0.104 0.305 0 1
## Record_Points 1,633 1.677 2.156 0 13
## Car_Age 1,633 8.409 5.645 0 28
## Education_High_School 1,633 0.288 0.453 0 1
## Education_Bachelors 1,633 0.282 0.450 0 1
## Education_Masters 1,633 0.200 0.400 0 1
## Education_PhD 1,633 0.089 0.285 0 1
## Job_Clerical 1,633 0.133 0.340 0 1
## Job_Doctor 1,633 0.077 0.266 0 1
## Job_Home_Maker 1,633 0.079 0.270 0 1
## Job_Lawyer 1,633 0.113 0.317 0 1
## Job_Manager 1,633 0.141 0.348 0 1
## Job_Professional 1,633 0.141 0.348 0 1
## Job_Student 1,633 0.088 0.284 0 1
## Urbanicity_Urban 1,633 0.802 0.398 0 1
## Car_Type_Panel_Truck 1,633 0.092 0.289 0 1
## Car_Type_Pickup 1,633 0.173 0.379 0 1
## Car_Type_Sports_Car 1,633 0.118 0.322 0 1
## Car_Type_SUV 1,633 0.283 0.451 0 1
## Car_Type_Van 1,633 0.093 0.291 0 1
## Sex_Male 1,633 0.460 0.499 0 1
## Car_Purpose_Private 1,633 0.620 0.486 0 1
## --------------------------------------------------------------------
# Log transformation of the Crash_Cost column
test_f$log_Crash_Cost <- log(test_f$Crash_Cost + 1)
# Apply the same dummy transformation to the test set as done for the training set
test_lasso <- test_f[, !names(test_f) %in% c("Crash_Cost", "log_Crash_Cost")]
X_test_lasso <- model.matrix(log_Crash_Cost ~ Bluebook + Income + Single_Parent + House_Worth + Marital_Status + Sex_Male + Education_High_School + Education_Bachelors + Education_Masters + Travel_Time + Car_Purpose_Private + Teen_Driver + Age + Child_count + Job_Tenure + Car_Type_Panel_Truck + Car_Type_Pickup + Car_Type_Sports_Car + Car_Type_SUV + Car_Type_Van + Prev_Claims + Claim_freq + Revoked + Record_Points + Urbanicity_Urban + Job_Clerical + Job_Doctor + Job_Home_Maker + Job_Lawyer + Job_Manager + Job_Professional + Job_Student + Time_in_Force,
data = test_f)[, -1] # Remove intercept column
# Scale the test data predictors using the scaling parameters from the training set
X_test_lasso <- scale(X_test_lasso, center = attr(X2, "scaled:center"), scale = attr(X2, "scaled:scale"))
# Make predictions using the linear model `la_eq`
predictions_test <- predict(la_eq, newdata = as.data.frame(X_test_lasso))
# Display predictions
head(predictions_test)
## 1 2 3 4 5 6
## -0.3102340 4.5151068 3.6404360 0.7792394 0.6396287 3.4613205
# Actual values
actual_values <- test_f$Crash_Cost
# Calculate Mean Squared Error (MSE)
mse <- mean((predictions_test - actual_values)^2)
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 30705078
# F-statistic from the model summary
f_statistic <- summary(la_eq)$fstatistic[1]
cat("F-statistic:", f_statistic, "\n")
## F-statistic: 148.0358
# R-squared value from the linear model summary
r_squared <- summary(la_eq)$r.squared
cat("R-squared:", r_squared, "\n")
## R-squared: 0.429271
The model explains about 43% of the variability in
Crash_Cost
(R-squared = 0.4293), indicating moderate
predictive power. While the F-statistic suggests the model is
statistically significant, the high Mean Squared Error (MSE =
30,705,078) indicates that the predictions deviate considerably from
actual values, pointing to room for improvement in model accuracy and
feature selection.
#Reverse form log
crash_cost_predictions <- exp(predictions_test)
We see that some continuous value close to 0 has been assigned to rows where crashes did not occur. So we will not include them as part of our predictions.
# Set predictions to 0 where Crash_Flag is 0
predictions_test[test_f$Crash_Flag == 0] <- 0
# Filter predictions for rows where Crash_Flag is not 0
predictions_non_zero_flag <- predictions_test[test_f$Crash_Flag != 0]
# Plot the histogram
hist(predictions_non_zero_flag, main = "Histogram of Crash Cost Predictions",
xlab = "Predicted Crash Cost", col = "blue", border = "black", breaks = 30)
# Prepare the test set (ensure it matches the training set structure)
test_lasso <- test_f[, !names(test_f) %in% c("Crash_Cost", "log_Crash_Cost")] # Exclude the target variable and unwanted columns
X_test <- model.matrix(Crash_Flag ~ ., test_lasso)[, -1] # Convert test data to matrix, remove intercept column
# Use the Lasso model to make predictions on the test set
lasso_pred_test <- predict(lasso_model, newx = X_test, s = "lambda.min", type = "response")
# Convert probabilities to class labels (threshold = 0.5 for classification)
lasso_class_test <- ifelse(lasso_pred_test > 0.5, 1, 0)
# View the confusion matrix for model performance
conf_matrix_test <- table(Predicted = lasso_class_test, Actual = test_f$Crash_Flag)
print(conf_matrix_test)
## Actual
## Predicted 0 1
## 0 1114 252
## 1 88 179
# Calculate Accuracy, Precision, Recall, F1-Score for the test data
accuracy_test <- sum(diag(conf_matrix_test)) / sum(conf_matrix_test)
precision_test <- conf_matrix_test[2, 2] / sum(conf_matrix_test[2, ])
recall_test <- conf_matrix_test[2, 2] / sum(conf_matrix_test[, 2])
f1_score_test <- 2 * (precision_test * recall_test) / (precision_test + recall_test)
cat("Test Accuracy: ", accuracy_test, "\n")
## Test Accuracy: 0.7917942
cat("Test Precision: ", precision_test, "\n")
## Test Precision: 0.670412
cat("Test Recall: ", recall_test, "\n")
## Test Recall: 0.4153132
cat("Test F1-Score: ", f1_score_test, "\n")
## Test F1-Score: 0.512894
# Classification Error Rate (Error Rate = 1 - Accuracy)
classification_error_rate <- 1 - accuracy_test
cat("Classification Error Rate: ", classification_error_rate, "\n")
## Classification Error Rate: 0.2082058
# Sensitivity (True Positive Rate)
sensitivity_test <- conf_matrix_test[2, 2] / (conf_matrix_test[2, 2] + conf_matrix_test[2, 1])
cat("Sensitivity: ", sensitivity_test, "\n")
## Sensitivity: 0.670412
# Specificity (True Negative Rate)
specificity_test <- conf_matrix_test[1, 1] / (conf_matrix_test[1, 1] + conf_matrix_test[1, 2])
cat("Specificity: ", specificity_test, "\n")
## Specificity: 0.8155198
Sensitivity (or recall): 64.7% — The model correctly predicts 64.7% of the actual crash events.
Specificity: 81.3% — The model correctly identifies 81.3% of non-crash events.
Precision: 64.7% — Among the predicted crashes, 64.7% are true positives.
F1-score: 50.1% — The harmonic mean of precision and recall, providing a balanced measure of the model’s performance.
In conclusion, the project aimed to predict the occurrence of a crash and the Crash Cost associated with it. For predicting the occurrence of a crash, the logistic regression model achieved an accuracy of 78.5%, with a precision of 64.7% and a recall of 40.8%. The F1-score was 50.1%, indicating a balanced trade-off between precision and recall. While the model performed reasonably well in terms of overall accuracy, there is still room for improvement, particularly in detecting crash occurrences more effectively.
For Crash Cost, the linear regression model provided insights with an R-squared value of 0.43. This indicates that approximately 43% of the variance in crash cost can be explained by the features used in the model. Although the model’s performance is acceptable, the relatively low R-squared suggests that additional features, such as the extent of damage, weather conditions, and other crash-related variables, could significantly improve the prediction of crash costs.
The inclusion of additional variables, like the extent of damage, would provide a more comprehensive understanding of both the likelihood of crashes and their financial implications, leading to more accurate predictions and better decision-making for risk management.