1 Background

In this homework assignment, you will explore, analyze, and model a data set containing approximately 8,000 records, each representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0: a "1" means that the person was in a car crash; a zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car; if they did crash their car, it is a value greater than zero. Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:

[Variable description table (image): see Variable_names.jpg in the project repository; the link appears in the code comment below.]

2 Data Exploration

2.1 Data Import

# https://github.com/datanerddhanya/DATA621/blob/3642a16df2ca3efabe5250ab37b6b6994e441fd4/Variable_names.jpg

data_train <- read.csv("https://raw.githubusercontent.com/datanerddhanya/DATA621/refs/heads/main/insurance_training_data.csv")
data_eval <- read.csv("https://raw.githubusercontent.com/datanerddhanya/DATA621/refs/heads/main/insurance-evaluation-data.csv")

2.2 Data Cleaning

The insurance training dataset contains 8161 observations and 26 variables, each observation representing a customer at an auto insurance company.

The results reveal that we need to fix the data format by removing "$" and "," from INCOME, HOME_VAL, BLUEBOOK, and OLDCLAIM.

CAR_AGE has a minimum value of -3, which does not make sense, so we update it to NA.

MSTATUS, SEX, EDUCATION, JOB, and CAR_TYPE have values prefixed with "z_", which we need to strip.

Additionally, there are no duplicates in this dataset.

# Structure of the data 
str(data_train)
## 'data.frame':    8161 obs. of  26 variables:
##  $ INDEX      : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET_FLAG: int  0 0 0 0 0 1 0 1 1 0 ...
##  $ TARGET_AMT : num  0 0 0 0 0 ...
##  $ KIDSDRIV   : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ AGE        : int  60 43 35 51 50 34 54 37 34 50 ...
##  $ HOMEKIDS   : int  0 0 1 0 0 1 0 2 0 0 ...
##  $ YOJ        : int  11 11 10 14 NA 12 NA NA 10 7 ...
##  $ INCOME     : chr  "$67,349" "$91,449" "$16,039" "" ...
##  $ PARENT1    : chr  "No" "No" "No" "No" ...
##  $ HOME_VAL   : chr  "$0" "$257,252" "$124,191" "$306,251" ...
##  $ MSTATUS    : chr  "z_No" "z_No" "Yes" "Yes" ...
##  $ SEX        : chr  "M" "M" "z_F" "M" ...
##  $ EDUCATION  : chr  "PhD" "z_High School" "z_High School" "<High School" ...
##  $ JOB        : chr  "Professional" "z_Blue Collar" "Clerical" "z_Blue Collar" ...
##  $ TRAVTIME   : int  14 22 5 32 36 46 33 44 34 48 ...
##  $ CAR_USE    : chr  "Private" "Commercial" "Private" "Private" ...
##  $ BLUEBOOK   : chr  "$14,230" "$14,940" "$4,010" "$15,440" ...
##  $ TIF        : int  11 1 4 7 1 1 1 1 1 7 ...
##  $ CAR_TYPE   : chr  "Minivan" "Minivan" "z_SUV" "Minivan" ...
##  $ RED_CAR    : chr  "yes" "yes" "no" "yes" ...
##  $ OLDCLAIM   : chr  "$4,461" "$0" "$38,690" "$0" ...
##  $ CLM_FREQ   : int  2 0 2 0 2 0 0 1 0 0 ...
##  $ REVOKED    : chr  "No" "No" "No" "No" ...
##  $ MVR_PTS    : int  3 0 3 0 3 0 0 10 0 1 ...
##  $ CAR_AGE    : int  18 1 10 6 17 7 1 7 1 17 ...
##  $ URBANICITY : chr  "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" "Highly Urban/ Urban" ...
# Glimpse of the data
head(data_train)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ   INCOME PARENT1
## 1     1           0          0        0  60        0  11  $67,349      No
## 2     2           0          0        0  43        0  11  $91,449      No
## 3     4           0          0        0  35        1  10  $16,039      No
## 4     5           0          0        0  51        0  14               No
## 5     6           0          0        0  50        0  NA $114,986      No
## 6     7           1       2946        0  34        1  12 $125,301     Yes
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE BLUEBOOK
## 1       $0    z_No   M           PhD  Professional       14    Private  $14,230
## 2 $257,252    z_No   M z_High School z_Blue Collar       22 Commercial  $14,940
## 3 $124,191     Yes z_F z_High School      Clerical        5    Private   $4,010
## 4 $306,251     Yes   M  <High School z_Blue Collar       32    Private  $15,440
## 5 $243,925     Yes z_F           PhD        Doctor       36    Private  $18,000
## 6       $0    z_No z_F     Bachelors z_Blue Collar       46 Commercial  $17,430
##   TIF   CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1  11    Minivan     yes   $4,461        2      No       3      18
## 2   1    Minivan     yes       $0        0      No       0       1
## 3   4      z_SUV      no  $38,690        2      No       3      10
## 4   7    Minivan     yes       $0        0      No       0       6
## 5   1      z_SUV      no  $19,217        2     Yes       3      17
## 6   1 Sports Car      no       $0        0      No       0       7
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Urban/ Urban
## 4 Highly Urban/ Urban
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
# Summary statistics
data_train %>%
  summary() %>%
  kable(caption = "Descriptive Statistics of Predictor Variables") %>%
  kable_styling()
Descriptive Statistics of Predictor Variables

Numeric variables (the 14 character columns, e.g. INCOME and BLUEBOOK, are not yet parsed and show only Length:8161 / Class:character):

Variable       Min  1st Qu.  Median     Mean  3rd Qu.     Max  NA's
INDEX            1     2559    5133     5152     7745   10302     0
TARGET_FLAG      0        0       0   0.2638        1       1     0
TARGET_AMT       0        0       0     1504     1036  107586     0
KIDSDRIV         0        0       0   0.1711        0       4     0
AGE             16       39      45    44.79       51      81     6
HOMEKIDS         0        0       0   0.7212        1       5     0
YOJ              0        9      11     10.5       13      23   454
TRAVTIME         5       22      33    33.49       44     142     0
TIF              1        1       4    5.351        7      25     0
CLM_FREQ         0        0       0   0.7986        2       5     0
MVR_PTS          0        0       1    1.696        3      13     0
CAR_AGE         -3        1       8    8.328       12      28   510
# Remove the 'z_' prefix from all values (note: this coerces every column to character)
data_train <- as.data.frame(lapply(data_train, function(x) gsub("^z_", "", x)))


# Fix the currency variables: strip "$" and "," and convert to numeric
data_train$INCOME = as.numeric(gsub("[$,]", "",data_train$INCOME))
data_train$HOME_VAL = as.numeric(gsub("[$,]", "",data_train$HOME_VAL))
data_train$BLUEBOOK = as.numeric(gsub("[$,]", "",data_train$BLUEBOOK))
data_train$OLDCLAIM = as.numeric(gsub("[$,]", "",data_train$OLDCLAIM))

# CAR_AGE has an invalid value of -3; recode negative values to NA
data_train$CAR_AGE <- ifelse(data_train$CAR_AGE < 0, NA, data_train$CAR_AGE)

# Remove "<" from "<High School" (note: this merges it into the "High School" level)
data_train$EDUCATION <- gsub("<", "", data_train$EDUCATION)

# Convert integer and numeric fields back, as they became character when removing 'z_'
data_train$TARGET_FLAG = as.integer(data_train$TARGET_FLAG)
data_train$KIDSDRIV = as.integer(data_train$KIDSDRIV)
data_train$AGE = as.integer(data_train$AGE)
data_train$HOMEKIDS = as.integer(data_train$HOMEKIDS)
data_train$TRAVTIME = as.integer(data_train$TRAVTIME)
data_train$YOJ = as.integer(data_train$YOJ)
data_train$TIF = as.integer(data_train$TIF)
data_train$CLM_FREQ = as.integer(data_train$CLM_FREQ)
data_train$MVR_PTS = as.integer(data_train$MVR_PTS)
data_train$CAR_AGE = as.integer(data_train$CAR_AGE)

data_train$TARGET_AMT= as.numeric(data_train$TARGET_AMT)


# Convert categorical variables to factors
  data_train$EDUCATION <- factor(data_train$EDUCATION)
  data_train$JOB <- factor(data_train$JOB)
  data_train$CAR_TYPE <- factor(data_train$CAR_TYPE)
  data_train$CAR_USE <- factor(data_train$CAR_USE)
  data_train$MSTATUS <- factor(data_train$MSTATUS)
  data_train$PARENT1 <- factor(data_train$PARENT1)
  data_train$RED_CAR <- factor(data_train$RED_CAR)
  data_train$SEX <- factor(data_train$SEX)
  data_train$URBANICITY <- factor(data_train$URBANICITY)
  data_train$REVOKED <- factor(data_train$REVOKED)

# The INDEX column is a row identifier with no impact on the target variable, so drop it
data_train <- data_train[ , -1]

# Check for duplicates
duplicates <- duplicated(data_train)

# Print the duplicates
print(data_train[duplicates, ])
##  [1] TARGET_FLAG TARGET_AMT  KIDSDRIV    AGE         HOMEKIDS    YOJ        
##  [7] INCOME      PARENT1     HOME_VAL    MSTATUS     SEX         EDUCATION  
## [13] JOB         TRAVTIME    CAR_USE     BLUEBOOK    TIF         CAR_TYPE   
## [19] RED_CAR     OLDCLAIM    CLM_FREQ    REVOKED     MVR_PTS     CAR_AGE    
## [25] URBANICITY 
## <0 rows> (or 0-length row.names)

2.3 Data Distribution

The table below provides summary statistics of the variables in the training dataset. Key metrics such as the minimum, quartiles, median, mean, and maximum help us understand the range, central tendency, and variability of each variable.

Based on the histogram plots, AGE, CAR_AGE, HOME_VAL, and YOJ appear approximately normally distributed.

Most of the predictors are right-skewed, so we will employ Box-Cox, log, and square-root transformations to make their distributions more nearly normal.
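For example, a Box-Cox lambda can be estimated for a strictly positive predictor such as BLUEBOOK; the sketch below uses MASS::boxcox and is illustrative rather than part of the modeling pipeline.

# Sketch: estimate the Box-Cox lambda for BLUEBOOK (strictly positive after cleaning)
bc <- MASS::boxcox(lm(BLUEBOOK ~ 1, data = data_train), plotit = FALSE)
lambda <- bc$x[which.max(bc$y)]  # lambda maximizing the profile log-likelihood
lambda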

Several predictors show extreme values alongside the bulk of their distribution. However, based on the feature meanings and provided information, there is no reason to believe that any of these extreme values are mistakes or data errors. As such, we will not remove them, as they could be predictive of the target.

TARGET_FLAG is imbalanced: far more customers did not crash than crashed, which could skew the models toward the customers who did not crash.

TARGET_AMT is right-skewed and needs to be transformed as well; most costs are concentrated below $20,000.

# Summary statistics
data_train %>%
  summary() %>%
  kable(caption = "Descriptive Statistics of Predictor Variables") %>%
  kable_styling()
Descriptive Statistics of Predictor Variables

Numeric variables:

Variable       Min  1st Qu.  Median     Mean  3rd Qu.     Max  NA's
TARGET_FLAG      0        0       0   0.2638        1       1     0
TARGET_AMT       0        0       0     1504     1036  107586     0
KIDSDRIV         0        0       0   0.1711        0       4     0
AGE             16       39      45    44.79       51      81     6
HOMEKIDS         0        0       0   0.7212        1       5     0
YOJ              0        9      11     10.5       13      23   454
INCOME           0    28097   54028    61898    85986  367030   445
HOME_VAL         0        0  161160   154867   238724  885282   464
TRAVTIME         5       22      33    33.49       44     142     0
BLUEBOOK      1500     9280   14440    15710    20850   69740     0
TIF              1        1       4    5.351        7      25     0
OLDCLAIM         0        0       0     4037     4636   57037     0
CLM_FREQ         0        0       0   0.7986        2       5     0
MVR_PTS          0        0       1    1.696        3      13     0
CAR_AGE          0        1       8     8.33       12      28   511

Factor variables (level counts):
PARENT1: No 7084, Yes 1077
MSTATUS: No 3267, Yes 4894
SEX: F 4375, M 3786
EDUCATION: Bachelors 2242, High School 3533, Masters 1658, PhD 728
JOB: Blue Collar 1825, Clerical 1271, Professional 1117, Manager 988, Lawyer 835, Student 712, (Other) 1413
CAR_USE: Commercial 3029, Private 5132
CAR_TYPE: Minivan 2145, Panel Truck 676, Pickup 1389, Sports Car 907, SUV 2294, Van 750
RED_CAR: no 5783, yes 2378
REVOKED: No 7161, Yes 1000
URBANICITY: Highly Rural/ Rural 1669, Highly Urban/ Urban 6492
# Plot histograms of all numeric variables in data_train
data_train |>
  dplyr::select(where(is.numeric)) |>
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value") |>
  filter(!is.na(Value)) |>
  ggplot(aes(x = Value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~Feature, scales = "free") +
  labs(title = "Histograms of Numerical Features", x = NULL, y = "Frequency") +
  theme_minimal()

# Bar plot of the target variable (0 = NO CRASH, 1 = CAR CRASH)
ggplot(data_train, aes(x = factor(TARGET_FLAG))) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Response variable: TARGET_FLAG",
    x = "Level (0 = the person was not in a car crash, 1 = the person was in a car crash)",
    y = "Count"
  ) +
  theme_minimal()

# Target Amount distribution (for crashes)
hist(as.numeric(data_train$TARGET_AMT[data_train$TARGET_FLAG == 1]), main="TARGET_AMT Distribution",
     xlab="Cost ($)", col="lightblue", breaks=100)

3 Data Preparation

3.1 Fix Missing Values

There are 511 observations missing CAR_AGE, 454 missing YOJ, 6 missing AGE, 445 missing INCOME, and 464 missing HOME_VAL. In total, 1714 observations, or 21% of the data, are missing at least one value. We will fill in the missing values with each variable's median.

# Count of missing Values per Variable
missing <- sapply(data_train, function(x) sum(is.na(x)))
kable(data.frame(Variable = names(missing), Missing = missing), caption = "Missing Values in Each Variable")
Missing Values in Each Variable
Variable Missing
TARGET_FLAG TARGET_FLAG 0
TARGET_AMT TARGET_AMT 0
KIDSDRIV KIDSDRIV 0
AGE AGE 6
HOMEKIDS HOMEKIDS 0
YOJ YOJ 454
INCOME INCOME 445
PARENT1 PARENT1 0
HOME_VAL HOME_VAL 464
MSTATUS MSTATUS 0
SEX SEX 0
EDUCATION EDUCATION 0
JOB JOB 0
TRAVTIME TRAVTIME 0
CAR_USE CAR_USE 0
BLUEBOOK BLUEBOOK 0
TIF TIF 0
CAR_TYPE CAR_TYPE 0
RED_CAR RED_CAR 0
OLDCLAIM OLDCLAIM 0
CLM_FREQ CLM_FREQ 0
REVOKED REVOKED 0
MVR_PTS MVR_PTS 0
CAR_AGE CAR_AGE 511
URBANICITY URBANICITY 0
data_train <- data_train %>% 
   mutate(across(c(CAR_AGE, YOJ, AGE, INCOME, HOME_VAL), ~ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))

# Count of missing Values per Variable
missing <- sapply(data_train, function(x) sum(is.na(x)))
kable(data.frame(Variable = names(missing), Missing = missing), caption = "Missing Values in Each Variable")
Missing Values in Each Variable
Variable Missing
TARGET_FLAG TARGET_FLAG 0
TARGET_AMT TARGET_AMT 0
KIDSDRIV KIDSDRIV 0
AGE AGE 0
HOMEKIDS HOMEKIDS 0
YOJ YOJ 0
INCOME INCOME 0
PARENT1 PARENT1 0
HOME_VAL HOME_VAL 0
MSTATUS MSTATUS 0
SEX SEX 0
EDUCATION EDUCATION 0
JOB JOB 0
TRAVTIME TRAVTIME 0
CAR_USE CAR_USE 0
BLUEBOOK BLUEBOOK 0
TIF TIF 0
CAR_TYPE CAR_TYPE 0
RED_CAR RED_CAR 0
OLDCLAIM OLDCLAIM 0
CLM_FREQ CLM_FREQ 0
REVOKED REVOKED 0
MVR_PTS MVR_PTS 0
CAR_AGE CAR_AGE 0
URBANICITY URBANICITY 0

3.2 Identifying Correlations

Positive correlations with TARGET_FLAG (higher crash risk): 1. TARGET_AMT: 0.53 (crashes have costs - validates the two-part approach) 2. MVR_PTS: 0.22 (violation points signal risky driving) 3. CLM_FREQ: 0.22 (frequent past claims signal risk) 4. OLDCLAIM: 0.14 5. HOMEKIDS: 0.12 (more kids = more risk) 6. KIDSDRIV: 0.10 (teen drivers increase risk) 7. TRAVTIME: 0.05 (longer commutes slightly increase exposure)

Negative correlations (lower crash risk): 1. HOME_VAL: -0.18 (homeowners drive more safely) 2. INCOME: -0.14 (higher income is associated with safer driving) 3. AGE: -0.10 (older drivers are safer, consistent with theory; a non-linear, U-shaped age effect may still be worth modeling) 4. CAR_AGE: -0.10 and BLUEBOOK: -0.10 5. TIF: -0.08 (longer time as a customer means a safer driver) 6. YOJ: -0.07 (longer job tenure slightly decreases risk)

Strongest correlations with crash cost (TARGET_AMT): 1. CLM_FREQ: 0.22 (frequent claimers have higher costs) 2. OLDCLAIM: 0.14 (past claims predict future costs) 3. BLUEBOOK: 0.08 (expensive cars = higher repair costs) 4. INCOME: 0.05 (wealthy people may have more expensive repairs)

Correlation among predictors: 1. INCOME <-> HOME_VAL: 0.54 (strong - the wealthy own homes) 2. OLDCLAIM <-> CLM_FREQ: 0.50 (strong - more claims = higher totals) 3. KIDSDRIV <-> HOMEKIDS: 0.46 (moderate - kids at home drive the family car) 4. AGE <-> HOMEKIDS: -0.45 (moderate - older = fewer kids at home) 5. INCOME <-> BLUEBOOK: 0.42 (moderate - the wealthy buy expensive cars) 6. HOME_VAL <-> BLUEBOOK: 0.39 (moderate - homeowners buy nicer cars)

Overall, most correlations are not strong, which is good. Composite features will help with the six correlated predictor pairs.
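As a quick check on those pairs, variance inflation factors can be computed; the sketch below assumes the car package and fits a simple linear probability model only to obtain VIFs for the numeric predictors.

# Sketch: VIFs for the numeric predictors involved in the correlated pairs
vif_fit <- lm(TARGET_FLAG ~ INCOME + HOME_VAL + BLUEBOOK + OLDCLAIM +
                CLM_FREQ + KIDSDRIV + HOMEKIDS + AGE, data = data_train)
car::vif(vif_fit)  # values above ~5 would flag problematic collinearity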

# Reshape the categorical predictors (plus TARGET_AMT) to long format for plotting
df_character_wide <- data_train %>%
  select(where(~ !is.numeric(.x)), TARGET_AMT) %>%
  pivot_longer(cols = -TARGET_AMT, names_to = "variable", values_to = "value")
                 
df_character_wide %>% 
  ggplot(mapping = aes(x = value, y = TARGET_AMT))+
  geom_boxplot()+facet_wrap(.~variable, scales="free")+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90))

#Correlation matrix with target

numeric_vars <- data_train %>% select_if(is.numeric) 
cor_matrix <- cor(numeric_vars, use="pairwise.complete.obs")
corr_target <- cor_matrix[,"TARGET_FLAG"]
corr_target_sorted <- sort(corr_target, decreasing = TRUE)

kable(as.data.frame(corr_target_sorted), col.names = c("Correlation with Target"), digits = 2)
Correlation with Target
TARGET_FLAG 1.00
TARGET_AMT 0.53
MVR_PTS 0.22
CLM_FREQ 0.22
OLDCLAIM 0.14
HOMEKIDS 0.12
KIDSDRIV 0.10
TRAVTIME 0.05
YOJ -0.07
TIF -0.08
CAR_AGE -0.10
AGE -0.10
BLUEBOOK -0.10
INCOME -0.14
HOME_VAL -0.18
#Correlation heatmap
corrplot::corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45, addCoef.col = "black",
         number.cex = 0.7, diag = FALSE)

# Correlation top 5 positive and negative
corr_target <- cor_matrix[, "TARGET_FLAG"]
corr_target <- corr_target[names(corr_target) != "TARGET_FLAG"]  # Remove self-correlation

top_pos <- sort(corr_target, decreasing = TRUE)[1:3]
top_neg <- sort(corr_target, decreasing = FALSE)[1:3]

combined_df <- data.frame(
  Feature = c(names(top_pos), names(top_neg)),
  Correlation = c(top_pos, top_neg)) %>%
  mutate(Direction = ifelse(Correlation > 0, "Positive", "Negative"))

ggplot(combined_df, aes(x = reorder(Feature, Correlation), y = Correlation, fill = Direction)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "skyblue", "Negative" = "salmon")) +
  labs(title = "Top Features Correlated with TARGET_FLAG",
       x = "Feature",
       y = "Correlation with Target",
       fill = "Direction") +
  theme_minimal()

# Correlation among predictors
melt_cor <- melt(cor_matrix)
filtered_cor <- melt_cor[melt_cor$Var1 != melt_cor$Var2 & as.numeric(melt_cor$Var1) < as.numeric(melt_cor$Var2), ]
sorted_cor <- filtered_cor[order(abs(filtered_cor$value), decreasing = TRUE), ]
head(sorted_cor)
##            Var1       Var2      value
## 112      INCOME   HOME_VAL  0.5430313
## 16  TARGET_FLAG TARGET_AMT  0.5342461
## 192    OLDCLAIM   CLM_FREQ  0.4951308
## 63     KIDSDRIV   HOMEKIDS  0.4640152
## 64          AGE   HOMEKIDS -0.4450739
## 142      INCOME   BLUEBOOK  0.4190438
# Box plots for key predictors vs TARGET_FLAG
par(mfrow=c(2,3))
boxplot(AGE ~ TARGET_FLAG, data=data_train, main="Age vs Crash", 
        xlab="Crash", ylab="Age", col=c("lightgreen", "salmon"))
boxplot(MVR_PTS ~ TARGET_FLAG, data=data_train, main="MVR Points vs Crash",
        xlab="Crash", ylab="MVR Points", col=c("lightgreen", "salmon"))
boxplot(CLM_FREQ ~ TARGET_FLAG, data=data_train, main="Claim Freq vs Crash",
        xlab="Crash", ylab="Claims", col=c("lightgreen", "salmon"))
boxplot(OLDCLAIM ~ TARGET_FLAG, data=data_train, main="Old Claims vs Crash",
        xlab="Crash", ylab="Old Claims", col=c("lightgreen", "salmon"))
boxplot(INCOME ~ TARGET_FLAG, data=data_train, main="Income vs Crash",
        xlab="Crash", ylab="Income", col=c("lightgreen", "salmon"))
boxplot(BLUEBOOK ~ TARGET_FLAG, data=data_train, main="Car Value(BLUEBOOK) vs Crash",
        xlab="Crash", ylab="Bluebook Value", col=c("lightgreen", "salmon"))

boxplot(HOME_VAL ~ TARGET_FLAG, data=data_train, main="Home Value vs Crash",
        xlab="Crash", ylab="Home Value", col=c("lightgreen", "salmon"))

3.3 Transform skewed predictors

# Log and square-root transformations of skewed predictors
data_transform <- data_train %>%
  mutate(
    LOG_TARGET_AMT = log(TARGET_AMT + 1),
    LOG_OLDCLAIM = log(OLDCLAIM + 1),
    SQRT_BLUEBOOK = sqrt(BLUEBOOK),
    LOG_HOME_VAL = log(HOME_VAL + 1),
    LOG_INCOME = log(INCOME + 1),
    LOG_TRAVTIME = log(TRAVTIME + 1),
    LOG_TIF = log(TIF + 1),
    LOG_CAR_AGE = log(CAR_AGE + 1)
  )

data_transform|>
  dplyr::select(where(is.numeric)) |>
  pivot_longer(cols = everything(), names_to = "Feature", values_to = "Value") |>
  filter(!is.na(Value)) |>
  ggplot(aes(x = Value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~Feature, scales = "free") +
  labs(title = "Histograms of Numerical Features", x = NULL, y = "Frequency") +
  theme_minimal()

3.4 Adding New Features

# Create new features
  # Age groups
  data_transform$AGE_YOUNG <- as.numeric(data_transform$AGE < 25)
  data_transform$AGE_OLD <- as.numeric(data_transform$AGE > 65)
  
  # High risk indicators
  data_transform$HIGH_RISK <- as.numeric(data_transform$MVR_PTS > 3 | data_transform$CLM_FREQ > 2)
  data_transform$REVOKED_NUM <- as.numeric(data_transform$REVOKED == "Yes")
  
  # Car value to income ratio
  data_transform$CAR_INCOME_RATIO <- data_transform$BLUEBOOK / (data_transform$INCOME + 1)
  
  # Homeowner flag
  data_transform$HOMEOWNER <- as.numeric(data_transform$HOME_VAL > 0)
  
  # Total driving exposure
  data_transform$TOTAL_DRIVERS <- data_transform$KIDSDRIV + 1

3.5 Split Data for Validation

To ensure objective model evaluation and prevent overfitting, we split each dataset into training (80%) and testing (20%) sets using a consistent random seed (set.seed(123)) for reproducibility.

# Split data for validation
set.seed(123)

train_idx <- createDataPartition(y = data_train$TARGET_FLAG, p = 0.8, list = FALSE)

# Create train/test splits for each dataset
train <- data_train[train_idx, ]
test <- data_train[-train_idx, ]

# Transformed predictors dataset
train_transformed <- data_transform[train_idx, ]
test_transformed <- data_transform[-train_idx, ]

3.6 Apply Standardization

Standardization can be helpful for logistic regression, although it is not strictly necessary. Since there is a large difference in spread among the predictors (as shown in the EDA), standardizing the data may be useful. It would be especially important if we ever decided to use regularization; see the sketch after the code below.

# Standardize the training and test sets (using training dataset standardization)
set.seed(123)

# Apply standardization from the train set to both train and test sets
train_features <- train |>
  dplyr::select(-c(TARGET_FLAG))
train_target <- train$TARGET_FLAG
preproc_params <- preProcess(train_features, method = c("center", "scale"))
train_standardized <- predict(preproc_params, train_features)
# add back in target variable
train_standardized$TARGET_FLAG <- as.factor(train_target)

test_features <- test[, -which(names(test) == "TARGET_FLAG")]
test_target <- test$TARGET_FLAG
test_standardized <- predict(preproc_params, test_features)
# add back in target variable
test_standardized$TARGET_FLAG <- as.factor(test_target)

# Transformed predictors dataset
train_transformed_features <- train_transformed[, -which(names(train_transformed) == "TARGET_FLAG")]
train_transformed_target <- train_transformed$TARGET_FLAG
preproc_params_transformed <- preProcess(train_transformed_features, method = c("center", "scale"))
train_transformed_standardized <- predict(preproc_params_transformed, train_transformed_features)
train_transformed_standardized$TARGET_FLAG <- as.factor(train_transformed_target)

test_transformed_features <- test_transformed[, -which(names(test_transformed) == "TARGET_FLAG")]
test_transformed_target <- test_transformed$TARGET_FLAG
test_transformed_standardized <- predict(preproc_params_transformed, test_transformed_features)
test_transformed_standardized$TARGET_FLAG <- as.factor(test_transformed_target)
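If we do explore regularization later, the standardized splits above can be used directly. Below is a minimal lasso sketch, assuming the glmnet package (not used elsewhere in this report); TARGET_AMT is excluded so the classifier does not leak the cost response.

# Sketch: cross-validated lasso logistic regression on the standardized training split
library(glmnet)
x <- model.matrix(TARGET_FLAG ~ . - TARGET_AMT, data = train_standardized)[, -1]
y <- train_standardized$TARGET_FLAG
cv_fit <- cv.glmnet(x, y, family = "binomial", alpha = 1)  # alpha = 1 -> lasso penalty
coef(cv_fit, s = "lambda.min")  # coefficients at the CV-selected penalty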

3.7 Fix Data Imbalance

Only 1723 of the 6529 records in the training split (about 26%) have been in an accident. We need to correct this imbalance by oversampling the minority class.

# Calculate class weights for imbalanced data (for reference; the models below use SMOTE instead)
crash_weight <- nrow(data_train) / (2 * sum(data_train$TARGET_FLAG == 1))
no_crash_weight <- nrow(data_train) / (2 * sum(data_train$TARGET_FLAG == 0))
weights <- ifelse(data_train$TARGET_FLAG == 1, crash_weight, no_crash_weight)

cat("Class weights calculated:\n")
## Class weights calculated:
cat("  Crash (minority class):", round(crash_weight, 2), "\n")
##   Crash (minority class): 1.9
cat("  No Crash (majority class):", round(no_crash_weight, 2), "\n\n")
##   No Crash (majority class): 0.68
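These class weights are shown for reference; the models below use SMOTE resampling instead. If we preferred weighting to resampling, the weights could be passed directly to glm, as in this minimal sketch (with 0/1 outcomes and non-integer weights, glm emits a harmless "non-integer #successes" warning):

# Sketch: weighted logistic regression as an alternative to SMOTE,
# using a few representative predictors rather than a tuned model
logit_weighted <- glm(TARGET_FLAG ~ MVR_PTS + CLM_FREQ + URBANICITY,
                      data = data_train, family = binomial, weights = weights)
summary(logit_weighted)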
# Only apply SMOTE to training sets
# These apply to LOGISTIC REGRESSION MODELS where TARGET_FLAG is the target
# Original dataset
train_smote <- train |>
  dplyr::select(-c(TARGET_AMT))
train_smote$TARGET_FLAG <- as.factor(train_smote$TARGET_FLAG)
train_smote <- smotenc(train_smote, var = "TARGET_FLAG", over_ratio = 1) 

# Transformed dataset
train_transformed_smote <- train_transformed |>
  dplyr::select(-c(TARGET_AMT))
train_transformed_smote$TARGET_FLAG <- as.factor(train_transformed_smote$TARGET_FLAG)
train_transformed_smote <- smotenc(train_transformed_smote, var = "TARGET_FLAG", over_ratio = 1)

# Standardized dataset
train_standardized_smote <- train_standardized |>
  dplyr::select(-c(TARGET_AMT))
train_standardized_smote$TARGET_FLAG <- as.factor(train_standardized_smote$TARGET_FLAG)
train_standardized_smote <- smotenc(train_standardized_smote, var = "TARGET_FLAG", over_ratio = 1)

# Standardized transformed dataset
train_transformed_standardized_smote <- train_transformed_standardized |>
  dplyr::select(-c(TARGET_AMT))
train_transformed_standardized_smote$TARGET_FLAG <- as.factor(train_transformed_standardized_smote$TARGET_FLAG)
train_transformed_standardized_smote <- smotenc(train_transformed_standardized_smote, var = "TARGET_FLAG", over_ratio = 1)

4 Build Models

4.1 Model 1: Baseline/Original Model (All Original Predictors)

Includes all original predictors, with no transformations, engineered features, or resampling.

summary(data_train)
##   TARGET_FLAG       TARGET_AMT        KIDSDRIV           AGE       
##  Min.   :0.0000   Min.   :     0   Min.   :0.0000   Min.   :16.00  
##  1st Qu.:0.0000   1st Qu.:     0   1st Qu.:0.0000   1st Qu.:39.00  
##  Median :0.0000   Median :     0   Median :0.0000   Median :45.00  
##  Mean   :0.2638   Mean   :  1504   Mean   :0.1711   Mean   :44.79  
##  3rd Qu.:1.0000   3rd Qu.:  1036   3rd Qu.:0.0000   3rd Qu.:51.00  
##  Max.   :1.0000   Max.   :107586   Max.   :4.0000   Max.   :81.00  
##                                                                    
##     HOMEKIDS           YOJ            INCOME       PARENT1       HOME_VAL     
##  Min.   :0.0000   Min.   : 0.00   Min.   :     0   No :7084   Min.   :     0  
##  1st Qu.:0.0000   1st Qu.: 9.00   1st Qu.: 29707   Yes:1077   1st Qu.:     0  
##  Median :0.0000   Median :11.00   Median : 54028              Median :161160  
##  Mean   :0.7212   Mean   :10.53   Mean   : 61469              Mean   :155225  
##  3rd Qu.:1.0000   3rd Qu.:13.00   3rd Qu.: 83304              3rd Qu.:233352  
##  Max.   :5.0000   Max.   :23.00   Max.   :367030              Max.   :885282  
##                                                                               
##  MSTATUS    SEX            EDUCATION              JOB          TRAVTIME     
##  No :3267   F:4375   Bachelors  :2242   Blue Collar :1825   Min.   :  5.00  
##  Yes:4894   M:3786   High School:3533   Clerical    :1271   1st Qu.: 22.00  
##                      Masters    :1658   Professional:1117   Median : 33.00  
##                      PhD        : 728   Manager     : 988   Mean   : 33.49  
##                                         Lawyer      : 835   3rd Qu.: 44.00  
##                                         Student     : 712   Max.   :142.00  
##                                         (Other)     :1413                   
##        CAR_USE        BLUEBOOK          TIF                CAR_TYPE   
##  Commercial:3029   Min.   : 1500   Min.   : 1.000   Minivan    :2145  
##  Private   :5132   1st Qu.: 9280   1st Qu.: 1.000   Panel Truck: 676  
##                    Median :14440   Median : 4.000   Pickup     :1389  
##                    Mean   :15710   Mean   : 5.351   Sports Car : 907  
##                    3rd Qu.:20850   3rd Qu.: 7.000   SUV        :2294  
##                    Max.   :69740   Max.   :25.000   Van        : 750  
##                                                                       
##  RED_CAR       OLDCLAIM        CLM_FREQ      REVOKED       MVR_PTS      
##  no :5783   Min.   :    0   Min.   :0.0000   No :7161   Min.   : 0.000  
##  yes:2378   1st Qu.:    0   1st Qu.:0.0000   Yes:1000   1st Qu.: 0.000  
##             Median :    0   Median :0.0000              Median : 1.000  
##             Mean   : 4037   Mean   :0.7986              Mean   : 1.696  
##             3rd Qu.: 4636   3rd Qu.:2.0000              3rd Qu.: 3.000  
##             Max.   :57037   Max.   :5.0000              Max.   :13.000  
##                                                                         
##     CAR_AGE                     URBANICITY  
##  Min.   : 0.000   Highly Rural/ Rural:1669  
##  1st Qu.: 4.000   Highly Urban/ Urban:6492  
##  Median : 8.000                             
##  Mean   : 8.309                             
##  3rd Qu.:12.000                             
##  Max.   :28.000                             
## 
# Original Model - using the full dataset to train the model
set.seed(123)
logit_model1 <- glm(TARGET_FLAG ~ AGE + BLUEBOOK + CAR_AGE + CAR_TYPE + CAR_USE + 
                    CLM_FREQ + EDUCATION + HOMEKIDS + HOME_VAL + INCOME + JOB + 
                    KIDSDRIV + MSTATUS + MVR_PTS + OLDCLAIM + PARENT1 + RED_CAR + 
                    REVOKED + SEX + TIF + TRAVTIME + URBANICITY + YOJ,
                    data=train, family=binomial)
summary(logit_model1)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + BLUEBOOK + CAR_AGE + CAR_TYPE + 
##     CAR_USE + CLM_FREQ + EDUCATION + HOMEKIDS + HOME_VAL + INCOME + 
##     JOB + KIDSDRIV + MSTATUS + MVR_PTS + OLDCLAIM + PARENT1 + 
##     RED_CAR + REVOKED + SEX + TIF + TRAVTIME + URBANICITY + YOJ, 
##     family = binomial, data = train)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.234e+00  3.777e-01  -8.563  < 2e-16 ***
## AGE                           -3.471e-03  4.506e-03  -0.770 0.441071    
## BLUEBOOK                      -2.083e-05  5.921e-06  -3.519 0.000434 ***
## CAR_AGE                       -3.771e-03  8.397e-03  -0.449 0.653399    
## CAR_TYPEPanel Truck            5.439e-01  1.815e-01   2.996 0.002732 ** 
## CAR_TYPEPickup                 5.482e-01  1.118e-01   4.906 9.30e-07 ***
## CAR_TYPESports Car             1.066e+00  1.446e-01   7.371 1.69e-13 ***
## CAR_TYPESUV                    7.888e-01  1.239e-01   6.367 1.93e-10 ***
## CAR_TYPEVan                    7.085e-01  1.397e-01   5.072 3.94e-07 ***
## CAR_USEPrivate                -7.718e-01  9.755e-02  -7.912 2.54e-15 ***
## CLM_FREQ                       1.721e-01  3.195e-02   5.387 7.17e-08 ***
## EDUCATIONHigh School           3.968e-01  9.731e-02   4.078 4.55e-05 ***
## EDUCATIONMasters               6.443e-02  1.566e-01   0.411 0.680713    
## EDUCATIONPhD                   1.979e-01  1.971e-01   1.004 0.315565    
## HOMEKIDS                       2.568e-02  4.175e-02   0.615 0.538423    
## HOME_VAL                      -1.379e-06  3.793e-07  -3.636 0.000277 ***
## INCOME                        -2.400e-06  1.192e-06  -2.014 0.044012 *  
## JOBBlue Collar                 3.790e-01  2.076e-01   1.826 0.067885 .  
## JOBClerical                    4.639e-01  2.200e-01   2.109 0.034962 *  
## JOBDoctor                     -2.750e-01  2.897e-01  -0.949 0.342490    
## JOBHome Maker                  3.055e-01  2.354e-01   1.298 0.194345    
## JOBLawyer                      1.716e-01  1.910e-01   0.899 0.368896    
## JOBManager                    -4.735e-01  1.925e-01  -2.460 0.013891 *  
## JOBProfessional                2.562e-01  1.999e-01   1.282 0.199948    
## JOBStudent                     2.814e-01  2.407e-01   1.169 0.242427    
## KIDSDRIV                       3.953e-01  6.932e-02   5.702 1.18e-08 ***
## MSTATUSYes                    -4.928e-01  9.386e-02  -5.250 1.52e-07 ***
## MVR_PTS                        1.121e-01  1.522e-02   7.368 1.74e-13 ***
## OLDCLAIM                      -1.114e-05  4.394e-06  -2.535 0.011233 *  
## PARENT1Yes                     4.774e-01  1.225e-01   3.896 9.78e-05 ***
## RED_CARyes                    -1.710e-02  9.673e-02  -0.177 0.859715    
## REVOKEDYes                     7.905e-01  1.026e-01   7.706 1.30e-14 ***
## SEXM                           6.820e-02  1.256e-01   0.543 0.587101    
## TIF                           -5.639e-02  8.139e-03  -6.928 4.27e-12 ***
## TRAVTIME                       1.490e-02  2.105e-03   7.077 1.47e-12 ***
## URBANICITYHighly Urban/ Urban  2.449e+00  1.263e-01  19.386  < 2e-16 ***
## YOJ                           -1.612e-02  9.637e-03  -1.673 0.094419 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7535.7  on 6528  degrees of freedom
## Residual deviance: 5828.5  on 6492  degrees of freedom
## AIC: 5902.5
## 
## Number of Fisher Scoring iterations: 5
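To compare the candidate models later, each can be scored on the held-out 20% test split. A minimal sketch for Model 1, assuming a 0.5 probability threshold and caret's confusionMatrix:

# Sketch: evaluate Model 1 on the test split at a 0.5 threshold
test_prob <- predict(logit_model1, newdata = test, type = "response")
test_pred <- factor(ifelse(test_prob > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(test_pred, factor(test$TARGET_FLAG, levels = c(0, 1)), positive = "1")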

4.2 Model 1B: Same as Model 1, but with class imbalance addressed (SMOTE) and the dataset standardized

set.seed(123)
# Model 1b: Same as Model 1 but standardize the dataset and address imbalance

cat("\nModel 1b: Full model with standardized dataset and SMOTE\n")
## 
## Model 1b: Full model with standardized dataset and SMOTE
logit_model1b <- glm(TARGET_FLAG ~ AGE + BLUEBOOK + CAR_AGE + CAR_TYPE + CAR_USE + 
                     CLM_FREQ + EDUCATION + HOMEKIDS + HOME_VAL + INCOME + JOB + 
                     KIDSDRIV + MSTATUS + MVR_PTS + OLDCLAIM + PARENT1 + RED_CAR + 
                     REVOKED + SEX + TIF + TRAVTIME + URBANICITY + YOJ,
                     data=train_standardized_smote, family=binomial)
summary(logit_model1b)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + BLUEBOOK + CAR_AGE + CAR_TYPE + 
##     CAR_USE + CLM_FREQ + EDUCATION + HOMEKIDS + HOME_VAL + INCOME + 
##     JOB + KIDSDRIV + MSTATUS + MVR_PTS + OLDCLAIM + PARENT1 + 
##     RED_CAR + REVOKED + SEX + TIF + TRAVTIME + URBANICITY + YOJ, 
##     family = binomial, data = train_standardized_smote)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.0579954  0.2053816 -14.889  < 2e-16 ***
## AGE                           -0.0176269  0.0307575  -0.573 0.566583    
## BLUEBOOK                      -0.1813206  0.0377048  -4.809 1.52e-06 ***
## CAR_AGE                       -0.0004521  0.0366749  -0.012 0.990165    
## CAR_TYPEPanel Truck            0.4665891  0.1374236   3.395 0.000686 ***
## CAR_TYPEPickup                 0.5295759  0.0846833   6.254 4.01e-10 ***
## CAR_TYPESports Car             1.1476263  0.1102966  10.405  < 2e-16 ***
## CAR_TYPESUV                    0.9811313  0.0931467  10.533  < 2e-16 ***
## CAR_TYPEVan                    0.6437475  0.1059321   6.077 1.22e-09 ***
## CAR_USEPrivate                -0.8285477  0.0782844 -10.584  < 2e-16 ***
## CLM_FREQ                       0.1732367  0.0304592   5.687 1.29e-08 ***
## EDUCATIONHigh School           0.4964468  0.0763017   6.506 7.70e-11 ***
## EDUCATIONMasters              -0.0377586  0.1217088  -0.310 0.756381    
## EDUCATIONPhD                  -0.0675001  0.1514225  -0.446 0.655761    
## HOMEKIDS                       0.0032063  0.0377349   0.085 0.932286    
## HOME_VAL                      -0.1976250  0.0373164  -5.296 1.18e-07 ***
## INCOME                        -0.1032652  0.0430294  -2.400 0.016401 *  
## JOBBlue Collar                 0.1621475  0.1582736   1.024 0.305611    
## JOBClerical                    0.2663657  0.1693948   1.572 0.115845    
## JOBDoctor                     -0.2434161  0.2088290  -1.166 0.243766    
## JOBHome Maker                  0.0184641  0.1813121   0.102 0.918887    
## JOBLawyer                      0.1589797  0.1393099   1.141 0.253789    
## JOBManager                    -0.8073681  0.1453024  -5.556 2.75e-08 ***
## JOBProfessional                0.0693833  0.1537225   0.451 0.651734    
## JOBStudent                    -0.0364654  0.1868510  -0.195 0.845270    
## KIDSDRIV                       0.2055106  0.0302731   6.789 1.13e-11 ***
## MSTATUSYes                    -0.4262216  0.0713781  -5.971 2.35e-09 ***
## MVR_PTS                        0.2274219  0.0272508   8.346  < 2e-16 ***
## OLDCLAIM                      -0.0601855  0.0313059  -1.922 0.054543 .  
## PARENT1Yes                     0.5486977  0.0975523   5.625 1.86e-08 ***
## RED_CARyes                     0.0896041  0.0741872   1.208 0.227120    
## REVOKEDYes                     0.3976006  0.0840508   4.730 2.24e-06 ***
## SEXM                           0.0694310  0.0954861   0.727 0.467145    
## TIF                           -0.3156493  0.0267454 -11.802  < 2e-16 ***
## TRAVTIME                       0.2559907  0.0273236   9.369  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.9792514  0.0996296  29.903  < 2e-16 ***
## YOJ                           -0.0694556  0.0308501  -2.251 0.024361 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13325.1  on 9611  degrees of freedom
## Residual deviance:  9854.5  on 9575  degrees of freedom
## AIC: 9928.5
## 
## Number of Fisher Scoring iterations: 5

4.3 Models 2 + 2B: Stepwise Selection (both directions) on Models 1 + 1B

set.seed(123)

# Model 2: Stepwise selection
logit_model2 <- stepAIC(logit_model1, direction="both", trace=0)
summary(logit_model2)
## 
## Call:
## glm(formula = TARGET_FLAG ~ BLUEBOOK + CAR_TYPE + CAR_USE + CLM_FREQ + 
##     EDUCATION + HOME_VAL + INCOME + JOB + KIDSDRIV + MSTATUS + 
##     MVR_PTS + OLDCLAIM + PARENT1 + REVOKED + TIF + TRAVTIME + 
##     URBANICITY + YOJ, family = binomial, data = train)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.364e+00  3.079e-01 -10.923  < 2e-16 ***
## BLUEBOOK                      -2.252e-05  5.313e-06  -4.237 2.26e-05 ***
## CAR_TYPEPanel Truck            5.825e-01  1.692e-01   3.444 0.000573 ***
## CAR_TYPEPickup                 5.475e-01  1.117e-01   4.902 9.48e-07 ***
## CAR_TYPESports Car             1.022e+00  1.195e-01   8.554  < 2e-16 ***
## CAR_TYPESUV                    7.503e-01  9.578e-02   7.834 4.72e-15 ***
## CAR_TYPEVan                    7.318e-01  1.343e-01   5.449 5.06e-08 ***
## CAR_USEPrivate                -7.668e-01  9.741e-02  -7.872 3.49e-15 ***
## CLM_FREQ                       1.721e-01  3.194e-02   5.388 7.14e-08 ***
## EDUCATIONHigh School           4.149e-01  8.992e-02   4.615 3.94e-06 ***
## EDUCATIONMasters               3.847e-02  1.506e-01   0.255 0.798368    
## EDUCATIONPhD                   1.653e-01  1.927e-01   0.858 0.390795    
## HOME_VAL                      -1.407e-06  3.775e-07  -3.726 0.000194 ***
## INCOME                        -2.377e-06  1.189e-06  -1.999 0.045577 *  
## JOBBlue Collar                 3.778e-01  2.075e-01   1.820 0.068690 .  
## JOBClerical                    4.684e-01  2.199e-01   2.130 0.033146 *  
## JOBDoctor                     -2.869e-01  2.890e-01  -0.993 0.320799    
## JOBHome Maker                  2.911e-01  2.339e-01   1.245 0.213302    
## JOBLawyer                      1.576e-01  1.905e-01   0.827 0.408040    
## JOBManager                    -4.881e-01  1.922e-01  -2.540 0.011092 *  
## JOBProfessional                2.445e-01  1.997e-01   1.224 0.220846    
## JOBStudent                     2.912e-01  2.397e-01   1.215 0.224414    
## KIDSDRIV                       4.117e-01  6.267e-02   6.570 5.04e-11 ***
## MSTATUSYes                    -4.694e-01  9.018e-02  -5.205 1.94e-07 ***
## MVR_PTS                        1.126e-01  1.521e-02   7.405 1.31e-13 ***
## OLDCLAIM                      -1.125e-05  4.392e-06  -2.561 0.010440 *  
## PARENT1Yes                     5.434e-01  1.050e-01   5.174 2.29e-07 ***
## REVOKEDYes                     7.945e-01  1.025e-01   7.749 9.24e-15 ***
## TIF                           -5.630e-02  8.136e-03  -6.920 4.52e-12 ***
## TRAVTIME                       1.483e-02  2.103e-03   7.050 1.79e-12 ***
## URBANICITYHighly Urban/ Urban  2.451e+00  1.264e-01  19.400  < 2e-16 ***
## YOJ                           -1.577e-02  9.362e-03  -1.684 0.092166 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7535.7  on 6528  degrees of freedom
## Residual deviance: 5830.4  on 6497  degrees of freedom
## AIC: 5894.4
## 
## Number of Fisher Scoring iterations: 5
# Model 2b: Stepwise with standardized data and SMOTE
cat("\nModel 2b: Stepwise selection with standardized data and SMOTE\n")
## 
## Model 2b: Stepwise selection with standardized data and SMOTE
logit_model2b <- stepAIC(logit_model1b, direction="both", trace=0)
summary(logit_model2b)
## 
## Call:
## glm(formula = TARGET_FLAG ~ BLUEBOOK + CAR_TYPE + CAR_USE + CLM_FREQ + 
##     EDUCATION + HOME_VAL + INCOME + JOB + KIDSDRIV + MSTATUS + 
##     MVR_PTS + OLDCLAIM + PARENT1 + RED_CAR + REVOKED + TIF + 
##     TRAVTIME + URBANICITY + YOJ, family = binomial, data = train_standardized_smote)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.02758    0.19878 -15.230  < 2e-16 ***
## BLUEBOOK                      -0.19278    0.03510  -5.493 3.96e-08 ***
## CAR_TYPEPanel Truck            0.49907    0.13070   3.818 0.000134 ***
## CAR_TYPEPickup                 0.52829    0.08462   6.243 4.29e-10 ***
## CAR_TYPESports Car             1.10575    0.09609  11.508  < 2e-16 ***
## CAR_TYPESUV                    0.94285    0.07720  12.214  < 2e-16 ***
## CAR_TYPEVan                    0.66146    0.10336   6.400 1.56e-10 ***
## CAR_USEPrivate                -0.82731    0.07823 -10.575  < 2e-16 ***
## CLM_FREQ                       0.17364    0.03045   5.703 1.18e-08 ***
## EDUCATIONHigh School           0.49753    0.07072   7.035 1.99e-12 ***
## EDUCATIONMasters              -0.03934    0.11726  -0.335 0.737274    
## EDUCATIONPhD                  -0.07523    0.14824  -0.507 0.611809    
## HOME_VAL                      -0.19857    0.03717  -5.341 9.22e-08 ***
## INCOME                        -0.10247    0.04292  -2.387 0.016971 *  
## JOBBlue Collar                 0.16494    0.15823   1.042 0.297195    
## JOBClerical                    0.27166    0.16927   1.605 0.108516    
## JOBDoctor                     -0.24514    0.20838  -1.176 0.239449    
## JOBHome Maker                  0.00675    0.18050   0.037 0.970170    
## JOBLawyer                      0.15454    0.13900   1.112 0.266251    
## JOBManager                    -0.81148    0.14511  -5.592 2.25e-08 ***
## JOBProfessional                0.06756    0.15358   0.440 0.660025    
## JOBStudent                    -0.03395    0.18624  -0.182 0.855369    
## KIDSDRIV                       0.20611    0.02727   7.557 4.12e-14 ***
## MSTATUSYes                    -0.42137    0.06907  -6.101 1.06e-09 ***
## MVR_PTS                        0.22736    0.02724   8.347  < 2e-16 ***
## OLDCLAIM                      -0.06084    0.03129  -1.944 0.051864 .  
## PARENT1Yes                     0.56672    0.08477   6.686 2.30e-11 ***
## RED_CARyes                     0.11306    0.06590   1.716 0.086208 .  
## REVOKEDYes                     0.39894    0.08402   4.748 2.05e-06 ***
## TIF                           -0.31559    0.02673 -11.805  < 2e-16 ***
## TRAVTIME                       0.25575    0.02731   9.364  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.98075    0.09964  29.916  < 2e-16 ***
## YOJ                           -0.07099    0.02992  -2.373 0.017658 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13325.1  on 9611  degrees of freedom
## Residual deviance:  9855.4  on 9579  degrees of freedom
## AIC: 9921.4
## 
## Number of Fisher Scoring iterations: 5
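Stepwise selection trims predictors while keeping AIC low; a threshold-free comparison on the test split can be made with AUC. A sketch assuming the pROC package (Model 2B is scored on the standardized test set it expects):

# Sketch: compare Models 2 and 2B by test-set AUC
library(pROC)
prob2  <- predict(logit_model2,  newdata = test, type = "response")
prob2b <- predict(logit_model2b, newdata = test_standardized, type = "response")
auc(roc(test$TARGET_FLAG, prob2))
auc(roc(test$TARGET_FLAG, prob2b))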

4.4 Model 3: SMOTE + Transformed Predictors

set.seed(123)
# Model 3: SMOTE + transformed dataset
# For reference (transformations from section 3.3):
#   LOG_OLDCLAIM  = log(OLDCLAIM + 1)
#   SQRT_BLUEBOOK = sqrt(BLUEBOOK)
#   LOG_HOME_VAL  = log(HOME_VAL + 1)
#   LOG_INCOME    = log(INCOME + 1)
#   LOG_TRAVTIME  = log(TRAVTIME + 1)
#   LOG_TIF       = log(TIF + 1)
#   LOG_CAR_AGE   = log(CAR_AGE + 1)
cat("\nModel 3: Transformed dataset model\n")
## 
## Model 3: Transformed dataset model
logit_model3 <- glm(TARGET_FLAG ~ AGE + SQRT_BLUEBOOK + LOG_CAR_AGE + CAR_TYPE + CAR_USE + 
                     CLM_FREQ + EDUCATION + HOMEKIDS + LOG_HOME_VAL + LOG_INCOME + JOB + 
                     KIDSDRIV + MSTATUS + MVR_PTS + LOG_OLDCLAIM + PARENT1 + RED_CAR + 
                     REVOKED + SEX + LOG_TIF + LOG_TRAVTIME + URBANICITY + YOJ,
                     data=train_transformed_smote, family=binomial)
summary(logit_model3)
## 
## Call:
## glm(formula = TARGET_FLAG ~ AGE + SQRT_BLUEBOOK + LOG_CAR_AGE + 
##     CAR_TYPE + CAR_USE + CLM_FREQ + EDUCATION + HOMEKIDS + LOG_HOME_VAL + 
##     LOG_INCOME + JOB + KIDSDRIV + MSTATUS + MVR_PTS + LOG_OLDCLAIM + 
##     PARENT1 + RED_CAR + REVOKED + SEX + LOG_TIF + LOG_TRAVTIME + 
##     URBANICITY + YOJ, family = binomial, data = train_transformed_smote)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -2.058592   0.376843  -5.463 4.69e-08 ***
## AGE                           -0.009853   0.003647  -2.702 0.006894 ** 
## SQRT_BLUEBOOK                 -0.005893   0.001043  -5.648 1.62e-08 ***
## LOG_CAR_AGE                    0.020701   0.041502   0.499 0.617923    
## CAR_TYPEPanel Truck            0.463450   0.133249   3.478 0.000505 ***
## CAR_TYPEPickup                 0.531554   0.085717   6.201 5.60e-10 ***
## CAR_TYPESports Car             1.067658   0.110040   9.702  < 2e-16 ***
## CAR_TYPESUV                    1.037582   0.091323  11.362  < 2e-16 ***
## CAR_TYPEVan                    0.691633   0.106392   6.501 7.99e-11 ***
## CAR_USEPrivate                -0.812991   0.078054 -10.416  < 2e-16 ***
## CLM_FREQ                       0.042180   0.040130   1.051 0.293213    
## EDUCATIONHigh School           0.535802   0.076237   7.028 2.09e-12 ***
## EDUCATIONMasters              -0.020565   0.116614  -0.176 0.860016    
## EDUCATIONPhD                  -0.068126   0.141665  -0.481 0.630591    
## HOMEKIDS                      -0.053894   0.034204  -1.576 0.115106    
## LOG_HOME_VAL                  -0.041678   0.006206  -6.716 1.87e-11 ***
## LOG_INCOME                    -0.133717   0.016314  -8.196 2.48e-16 ***
## JOBBlue Collar                 0.348830   0.156485   2.229 0.025803 *  
## JOBClerical                    0.459772   0.164698   2.792 0.005245 ** 
## JOBDoctor                     -0.468625   0.217761  -2.152 0.031396 *  
## JOBHome Maker                 -0.280449   0.187336  -1.497 0.134384    
## JOBLawyer                      0.242489   0.138222   1.754 0.079372 .  
## JOBManager                    -0.630465   0.144069  -4.376 1.21e-05 ***
## JOBProfessional                0.238325   0.149981   1.589 0.112051    
## JOBStudent                    -0.383699   0.194102  -1.977 0.048066 *  
## KIDSDRIV                       0.428999   0.060199   7.126 1.03e-12 ***
## MSTATUSYes                    -0.409284   0.074451  -5.497 3.86e-08 ***
## MVR_PTS                        0.098907   0.012864   7.688 1.49e-14 ***
## LOG_OLDCLAIM                   0.032501   0.011326   2.870 0.004109 ** 
## PARENT1Yes                     0.492488   0.096986   5.078 3.82e-07 ***
## RED_CARyes                     0.096040   0.074814   1.284 0.199241    
## REVOKEDYes                     0.415556   0.074820   5.554 2.79e-08 ***
## SEXM                           0.049699   0.094234   0.527 0.597920    
## LOG_TIF                       -0.413450   0.036558 -11.309  < 2e-16 ***
## LOG_TRAVTIME                   0.504264   0.048450  10.408  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.939004   0.099885  29.424  < 2e-16 ***
## YOJ                            0.031439   0.009987   3.148 0.001644 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13325.1  on 9611  degrees of freedom
## Residual deviance:  9717.5  on 9575  degrees of freedom
## AIC: 9791.5
## 
## Number of Fisher Scoring iterations: 5

4.5 Model 4: SMOTE + New Features

# Model 4 

# For reference:
  # # Age groups
  # data_transform$AGE_YOUNG <- as.numeric(data_transform$AGE < 25)
  # data_transform$AGE_OLD <- as.numeric(data_transform$AGE > 65)
  # 
  # # High risk indicators
  # data_transform$HIGH_RISK <- as.numeric(data_transform$MVR_PTS > 3 | data_transform$CLM_FREQ > 2)
  # data_transform$REVOKED_NUM <- as.numeric(data_transform$REVOKED == "Yes")
  # 
  # # Car value to income ratio
  # data_transform$CAR_INCOME_RATIO <- data_transform$BLUEBOOK / (data_transform$INCOME + 1)
  # 
  # # Homeowner flag
  # data_transform$HOMEOWNER <- as.numeric(data_transform$HOME_VAL > 0)
  # 
  # # Total driving exposure
  # data_transform$TOTAL_DRIVERS <- data_transform$KIDSDRIV + 1

set.seed(123)

cat("\nModel 4: New Features Model\n")
## 
## Model 4: New Features Model
# Replace AGE, MVR_PTS, CLM_FREQ, KIDSDRIV, BLUEBOOK, INCOME, HOME_VAL, and REVOKED
# with the engineered features (AGE_YOUNG/AGE_OLD, HIGH_RISK, TOTAL_DRIVERS,
# CAR_INCOME_RATIO, HOMEOWNER, REVOKED_NUM)
logit_model4 <- glm(TARGET_FLAG ~ CAR_AGE + CAR_TYPE + CAR_USE + 
                     EDUCATION + HOMEKIDS + JOB + 
                     MSTATUS + OLDCLAIM + PARENT1 + RED_CAR + 
                     SEX + TIF + TRAVTIME + URBANICITY + YOJ +
                     AGE_YOUNG + AGE_OLD + HIGH_RISK + REVOKED_NUM + CAR_INCOME_RATIO +
                     HOMEOWNER + TOTAL_DRIVERS,
                     data=train_transformed_smote, family=binomial)
summary(logit_model4)
## 
## Call:
## glm(formula = TARGET_FLAG ~ CAR_AGE + CAR_TYPE + CAR_USE + EDUCATION + 
##     HOMEKIDS + JOB + MSTATUS + OLDCLAIM + PARENT1 + RED_CAR + 
##     SEX + TIF + TRAVTIME + URBANICITY + YOJ + AGE_YOUNG + AGE_OLD + 
##     HIGH_RISK + REVOKED_NUM + CAR_INCOME_RATIO + HOMEOWNER + 
##     TOTAL_DRIVERS, family = binomial, data = train_transformed_smote)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -4.024e+00  2.516e-01 -15.992  < 2e-16 ***
## CAR_AGE                        4.282e-03  6.442e-03   0.665 0.506270    
## CAR_TYPEPanel Truck            1.260e-01  1.220e-01   1.033 0.301575    
## CAR_TYPEPickup                 6.324e-01  8.436e-02   7.497 6.51e-14 ***
## CAR_TYPESports Car             1.348e+00  1.049e-01  12.849  < 2e-16 ***
## CAR_TYPESUV                    1.286e+00  8.576e-02  14.996  < 2e-16 ***
## CAR_TYPEVan                    5.295e-01  1.023e-01   5.175 2.28e-07 ***
## CAR_USEPrivate                -7.958e-01  7.694e-02 -10.344  < 2e-16 ***
## EDUCATIONHigh School           5.947e-01  7.463e-02   7.968 1.61e-15 ***
## EDUCATIONMasters              -6.885e-02  1.176e-01  -0.586 0.558091    
## EDUCATIONPhD                  -2.133e-01  1.416e-01  -1.507 0.131852    
## HOMEKIDS                       1.192e-02  3.056e-02   0.390 0.696655    
## JOBBlue Collar                 3.680e-01  1.548e-01   2.377 0.017468 *  
## JOBClerical                    5.403e-01  1.629e-01   3.317 0.000909 ***
## JOBDoctor                     -5.845e-01  2.152e-01  -2.715 0.006618 ** 
## JOBHome Maker                  1.344e-01  1.756e-01   0.765 0.444091    
## JOBLawyer                      1.904e-01  1.368e-01   1.392 0.163924    
## JOBManager                    -6.844e-01  1.426e-01  -4.798 1.60e-06 ***
## JOBProfessional                2.264e-01  1.485e-01   1.525 0.127226    
## JOBStudent                     2.379e-02  1.843e-01   0.129 0.897273    
## MSTATUSYes                    -3.971e-01  7.319e-02  -5.426 5.78e-08 ***
## OLDCLAIM                      -3.775e-06  3.328e-06  -1.134 0.256734    
## PARENT1Yes                     4.874e-01  9.567e-02   5.095 3.50e-07 ***
## RED_CARyes                     1.177e-01  7.449e-02   1.580 0.114085    
## SEXM                           2.618e-01  8.873e-02   2.951 0.003171 ** 
## TIF                           -7.158e-02  6.271e-03 -11.414  < 2e-16 ***
## TRAVTIME                       1.661e-02  1.700e-03   9.770  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  3.058e+00  9.939e-02  30.766  < 2e-16 ***
## YOJ                            5.202e-05  8.677e-03   0.006 0.995216    
## AGE_YOUNG                      4.841e-01  2.863e-01   1.691 0.090875 .  
## AGE_OLD                       -5.568e-01  3.268e-01  -1.704 0.088417 .  
## HIGH_RISK                      5.767e-01  5.730e-02  10.065  < 2e-16 ***
## REVOKED_NUM                    5.845e-01  8.215e-02   7.115 1.12e-12 ***
## CAR_INCOME_RATIO               6.267e-05  1.075e-05   5.831 5.51e-09 ***
## HOMEOWNER                     -5.285e-01  7.473e-02  -7.071 1.54e-12 ***
## TOTAL_DRIVERS                  4.181e-01  5.855e-02   7.142 9.21e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13325.1  on 9611  degrees of freedom
## Residual deviance:  9889.2  on 9576  degrees of freedom
## AIC: 9961.2
## 
## Number of Fisher Scoring iterations: 5

4.6 Model 4B: SMOTE + New Features + Transformed Predictors

set.seed(123)
# Model 4B: SMOTE + transformed dataset + new features

# For reference:
    # LOG_TARGET_AMT = log(data_train$TARGET_AMT + 1),
    # LOG_OLDCLAIM = log(data_train$OLDCLAIM + 1),
    # SQRT_BLUEBOOK = sqrt(data_train$BLUEBOOK),
    # LOG_HOME_VAL = log(data_train$HOME_VAL + 1),
    # LOG_INCOME = log(data_train$INCOME + 1),
    # LOG_TRAVTIME = log(data_train$TRAVTIME + 1),
    # LOG_TIF = log(data_train$TIF + 1),
    # LOG_CAR_AGE = log(data_train$CAR_AGE + 1)
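
# A minimal runnable sketch (hypothetical helper name; the actual columns
# were added to train_transformed_smote earlier in the workflow) of how the
# transformations listed above can be derived in one step:
make_transforms <- function(df) {
  dplyr::mutate(
    df,
    LOG_TARGET_AMT = log(TARGET_AMT + 1),
    LOG_OLDCLAIM   = log(OLDCLAIM + 1),
    SQRT_BLUEBOOK  = sqrt(BLUEBOOK),
    LOG_HOME_VAL   = log(HOME_VAL + 1),
    LOG_INCOME     = log(INCOME + 1),
    LOG_TRAVTIME   = log(TRAVTIME + 1),
    LOG_TIF        = log(TIF + 1),
    LOG_CAR_AGE    = log(CAR_AGE + 1)
  )
}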


cat("\nModel 4b: Transformed dataset + new features model\n")
## 
## Model 4b: Transformed dataset + new features model
# logit_model4b <- glm(TARGET_FLAG ~ AGE + SQRT_BLUEBOOK + LOG_CAR_AGE + CAR_TYPE + CAR_USE + 
#                      CLM_FREQ + EDUCATION + HOMEKIDS + LOG_HOME_VAL + LOG_INCOME + JOB + 
#                      MSTATUS + MVR_PTS + LOG_OLDCLAIM + PARENT1 + RED_CAR + 
#                      SEX + LOG_TIF + LOG_TRAVTIME + URBANICITY + YOJ +
#                      AGE_YOUNG + AGE_OLD + HIGH_RISK + REVOKED_NUM + CAR_INCOME_RATIO +
#                      HOMEOWNER + TOTAL_DRIVERS,
#                      data=train_transformed_smote, family=binomial)

logit_model4b <- glm(TARGET_FLAG ~ LOG_CAR_AGE + CAR_TYPE + CAR_USE + 
                     EDUCATION + HOMEKIDS + JOB + 
                     MSTATUS + LOG_OLDCLAIM + PARENT1 + RED_CAR + 
                     SEX + LOG_TIF + LOG_TRAVTIME + URBANICITY + YOJ +
                     AGE_YOUNG + AGE_OLD + HIGH_RISK + REVOKED_NUM + CAR_INCOME_RATIO +
                     HOMEOWNER + TOTAL_DRIVERS,
                     data=train_transformed_smote, family=binomial)

summary(logit_model4b)
## 
## Call:
## glm(formula = TARGET_FLAG ~ LOG_CAR_AGE + CAR_TYPE + CAR_USE + 
##     EDUCATION + HOMEKIDS + JOB + MSTATUS + LOG_OLDCLAIM + PARENT1 + 
##     RED_CAR + SEX + LOG_TIF + LOG_TRAVTIME + URBANICITY + YOJ + 
##     AGE_YOUNG + AGE_OLD + HIGH_RISK + REVOKED_NUM + CAR_INCOME_RATIO + 
##     HOMEOWNER + TOTAL_DRIVERS, family = binomial, data = train_transformed_smote)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -4.882e+00  3.122e-01 -15.634  < 2e-16 ***
## LOG_CAR_AGE                    1.669e-02  4.131e-02   0.404 0.686223    
## CAR_TYPEPanel Truck            1.092e-01  1.222e-01   0.893 0.371677    
## CAR_TYPEPickup                 6.213e-01  8.474e-02   7.332 2.27e-13 ***
## CAR_TYPESports Car             1.324e+00  1.053e-01  12.573  < 2e-16 ***
## CAR_TYPESUV                    1.257e+00  8.614e-02  14.588  < 2e-16 ***
## CAR_TYPEVan                    5.189e-01  1.027e-01   5.053 4.36e-07 ***
## CAR_USEPrivate                -7.789e-01  7.725e-02 -10.083  < 2e-16 ***
## EDUCATIONHigh School           6.005e-01  7.555e-02   7.949 1.88e-15 ***
## EDUCATIONMasters              -4.588e-02  1.155e-01  -0.397 0.691108    
## EDUCATIONPhD                  -1.634e-01  1.399e-01  -1.167 0.243045    
## HOMEKIDS                       1.092e-02  3.074e-02   0.355 0.722500    
## JOBBlue Collar                 3.886e-01  1.553e-01   2.503 0.012327 *  
## JOBClerical                    5.483e-01  1.633e-01   3.358 0.000784 ***
## JOBDoctor                     -5.680e-01  2.157e-01  -2.633 0.008465 ** 
## JOBHome Maker                  1.305e-01  1.762e-01   0.740 0.459112    
## JOBLawyer                      2.128e-01  1.372e-01   1.551 0.120937    
## JOBManager                    -6.707e-01  1.432e-01  -4.684 2.81e-06 ***
## JOBProfessional                2.392e-01  1.487e-01   1.608 0.107736    
## JOBStudent                     3.910e-02  1.848e-01   0.212 0.832387    
## MSTATUSYes                    -3.729e-01  7.357e-02  -5.070 3.99e-07 ***
## LOG_OLDCLAIM                   4.624e-02  7.079e-03   6.532 6.49e-11 ***
## PARENT1Yes                     5.166e-01  9.598e-02   5.383 7.33e-08 ***
## RED_CARyes                     1.236e-01  7.479e-02   1.653 0.098388 .  
## SEXM                           2.521e-01  8.914e-02   2.828 0.004677 ** 
## LOG_TIF                       -4.140e-01  3.629e-02 -11.408  < 2e-16 ***
## LOG_TRAVTIME                   4.965e-01  4.801e-02  10.342  < 2e-16 ***
## URBANICITYHighly Urban/ Urban  2.950e+00  9.964e-02  29.603  < 2e-16 ***
## YOJ                           -1.535e-03  8.698e-03  -0.176 0.859938    
## AGE_YOUNG                      5.483e-01  2.856e-01   1.920 0.054880 .  
## AGE_OLD                       -5.367e-01  3.302e-01  -1.625 0.104143    
## HIGH_RISK                      3.081e-01  6.621e-02   4.654 3.26e-06 ***
## REVOKED_NUM                    5.063e-01  7.376e-02   6.864 6.70e-12 ***
## CAR_INCOME_RATIO               5.969e-05  1.082e-05   5.517 3.45e-08 ***
## HOMEOWNER                     -5.224e-01  7.503e-02  -6.963 3.34e-12 ***
## TOTAL_DRIVERS                  3.975e-01  5.854e-02   6.789 1.13e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13325.1  on 9611  degrees of freedom
## Residual deviance:  9826.7  on 9576  degrees of freedom
## AIC: 9898.7
## 
## Number of Fisher Scoring iterations: 5
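
Relative to Model 4, transforming the skewed predictors lowers the residual deviance from 9889.2 to 9826.7 and the AIC from 9961.2 to 9898.7, a modest but consistent improvement.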

4.4 Multiple Linear Regression Models for TARGET_AMT Using Transformations

# Subset to customers who experienced a crash (TARGET_FLAG = 1)
train_crash <- data_transform %>% filter(TARGET_FLAG == 1)
test_crash  <- test_transformed %>% filter(TARGET_FLAG == 1)

cat("Training crash subset:", nrow(train_crash), "observations\n")
## Training crash subset: 2153 observations
cat("Test crash subset:", nrow(test_crash), "observations\n\n")
## Test crash subset: 430 observations
# Model 1: Basic full model

cat("=== LM1: Basic Model ===\n")
## === LM1: Basic Model ===
lm_model1 <- lm(LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + LOG_INCOME +
    CAR_AGE + CAR_TYPE + CAR_USE +
    CLM_FREQ + MVR_PTS,
  data = train_crash
)

summary(lm_model1)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + 
##     LOG_INCOME + CAR_AGE + CAR_TYPE + CAR_USE + CLM_FREQ + MVR_PTS, 
##     data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6866 -0.3966  0.0415  0.4026  3.2465 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.9174617  0.0979543  80.828   <2e-16 ***
## SQRT_BLUEBOOK        0.0026128  0.0006429   4.064    5e-05 ***
## LOG_OLDCLAIM         0.0075156  0.0069261   1.085   0.2780    
## LOG_INCOME           0.0018435  0.0051767   0.356   0.7218    
## CAR_AGE             -0.0005098  0.0033577  -0.152   0.8793    
## CAR_TYPEPanel Truck  0.0727282  0.0865756   0.840   0.4010    
## CAR_TYPEPickup       0.0376152  0.0609807   0.617   0.5374    
## CAR_TYPESports Car  -0.0078313  0.0641388  -0.122   0.9028    
## CAR_TYPESUV          0.0111470  0.0536391   0.208   0.8354    
## CAR_TYPEVan          0.0138519  0.0757638   0.183   0.8549    
## CAR_USEPrivate       0.0085784  0.0415176   0.207   0.8363    
## CLM_FREQ            -0.0428954  0.0238388  -1.799   0.0721 .  
## MVR_PTS              0.0155532  0.0072566   2.143   0.0322 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8068 on 2140 degrees of freedom
## Multiple R-squared:  0.01863,    Adjusted R-squared:  0.01313 
## F-statistic: 3.386 on 12 and 2140 DF,  p-value: 6.294e-05
# Model 2: Full Model

cat("\n=== LM2: Full Model ===\n")
## 
## === LM2: Full Model ===
lm_model2 <-  lm(
  LOG_TARGET_AMT ~ .,
  data = train_crash %>% 
    dplyr::select(LOG_TARGET_AMT, SQRT_BLUEBOOK, LOG_OLDCLAIM, LOG_INCOME,
           LOG_HOME_VAL, CAR_AGE, CAR_TYPE, CAR_USE, CLM_FREQ, MVR_PTS,
           AGE_YOUNG, AGE_OLD, HIGH_RISK, CAR_INCOME_RATIO,
           HOMEOWNER, TOTAL_DRIVERS)
)


summary(lm_model2)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ ., data = train_crash %>% dplyr::select(LOG_TARGET_AMT, 
##     SQRT_BLUEBOOK, LOG_OLDCLAIM, LOG_INCOME, LOG_HOME_VAL, CAR_AGE, 
##     CAR_TYPE, CAR_USE, CLM_FREQ, MVR_PTS, AGE_YOUNG, AGE_OLD, 
##     HIGH_RISK, CAR_INCOME_RATIO, HOMEOWNER, TOTAL_DRIVERS))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6959 -0.3910  0.0318  0.4050  3.2553 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.868e+00  1.215e-01  64.747  < 2e-16 ***
## SQRT_BLUEBOOK        2.440e-03  6.887e-04   3.544 0.000403 ***
## LOG_OLDCLAIM         7.700e-03  7.080e-03   1.088 0.276902    
## LOG_INCOME           1.079e-02  9.613e-03   1.123 0.261747    
## LOG_HOME_VAL        -4.876e-02  6.159e-02  -0.792 0.428609    
## CAR_AGE             -1.731e-04  3.433e-03  -0.050 0.959800    
## CAR_TYPEPanel Truck  8.541e-02  8.723e-02   0.979 0.327615    
## CAR_TYPEPickup       3.912e-02  6.123e-02   0.639 0.523028    
## CAR_TYPESports Car  -4.549e-03  6.448e-02  -0.071 0.943761    
## CAR_TYPESUV          1.332e-02  5.388e-02   0.247 0.804791    
## CAR_TYPEVan          1.705e-02  7.600e-02   0.224 0.822470    
## CAR_USEPrivate       9.352e-03  4.186e-02   0.223 0.823245    
## CLM_FREQ            -4.499e-02  2.787e-02  -1.614 0.106635    
## MVR_PTS              1.523e-02  1.011e-02   1.506 0.132214    
## AGE_YOUNG           -1.391e-02  1.295e-01  -0.107 0.914437    
## AGE_OLD              8.480e-02  2.180e-01   0.389 0.697288    
## HIGH_RISK            4.654e-03  6.160e-02   0.076 0.939777    
## CAR_INCOME_RATIO     7.534e-06  7.957e-06   0.947 0.343824    
## HOMEOWNER            5.783e-01  7.424e-01   0.779 0.436067    
## TOTAL_DRIVERS       -1.815e-02  2.808e-02  -0.646 0.518175    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8077 on 2133 degrees of freedom
## Multiple R-squared:  0.01961,    Adjusted R-squared:  0.01088 
## F-statistic: 2.246 on 19 and 2133 DF,  p-value: 0.001556
# Model 3: Log-transformed response (recommended for skewed target)

cat("\n=== LM3: Log-transformed ===\n")
## 
## === LM3: Log-transformed ===
lm_model3 <- lm(LOG_TARGET_AMT ~ SQRT_BLUEBOOK + CAR_AGE + CAR_TYPE + 
                LOG_OLDCLAIM + CLM_FREQ + MVR_PTS + LOG_INCOME + CAR_USE,
                data=train_crash)
summary(lm_model3)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + CAR_AGE + CAR_TYPE + 
##     LOG_OLDCLAIM + CLM_FREQ + MVR_PTS + LOG_INCOME + CAR_USE, 
##     data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6866 -0.3966  0.0415  0.4026  3.2465 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.9174617  0.0979543  80.828   <2e-16 ***
## SQRT_BLUEBOOK        0.0026128  0.0006429   4.064    5e-05 ***
## CAR_AGE             -0.0005098  0.0033577  -0.152   0.8793    
## CAR_TYPEPanel Truck  0.0727282  0.0865756   0.840   0.4010    
## CAR_TYPEPickup       0.0376152  0.0609807   0.617   0.5374    
## CAR_TYPESports Car  -0.0078313  0.0641388  -0.122   0.9028    
## CAR_TYPESUV          0.0111470  0.0536391   0.208   0.8354    
## CAR_TYPEVan          0.0138519  0.0757638   0.183   0.8549    
## LOG_OLDCLAIM         0.0075156  0.0069261   1.085   0.2780    
## CLM_FREQ            -0.0428954  0.0238388  -1.799   0.0721 .  
## MVR_PTS              0.0155532  0.0072566   2.143   0.0322 *  
## LOG_INCOME           0.0018435  0.0051767   0.356   0.7218    
## CAR_USEPrivate       0.0085784  0.0415176   0.207   0.8363    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8068 on 2140 degrees of freedom
## Multiple R-squared:  0.01863,    Adjusted R-squared:  0.01313 
## F-statistic: 3.386 on 12 and 2140 DF,  p-value: 6.294e-05
# Model 4: Log + New Features

cat("\n=== LM4: Log + New Features ===\n")
## 
## === LM4: Log + New Features ===
lm_model4 <- lm(
  LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + LOG_INCOME + LOG_HOME_VAL +
    CAR_AGE + CAR_TYPE + CAR_USE +
    CLM_FREQ + MVR_PTS +
    AGE_YOUNG + AGE_OLD + HIGH_RISK + CAR_INCOME_RATIO +
    HOMEOWNER + TOTAL_DRIVERS,
  data = train_crash
)

summary(lm_model4)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + 
##     LOG_INCOME + LOG_HOME_VAL + CAR_AGE + CAR_TYPE + CAR_USE + 
##     CLM_FREQ + MVR_PTS + AGE_YOUNG + AGE_OLD + HIGH_RISK + CAR_INCOME_RATIO + 
##     HOMEOWNER + TOTAL_DRIVERS, data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6959 -0.3910  0.0318  0.4050  3.2553 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.868e+00  1.215e-01  64.747  < 2e-16 ***
## SQRT_BLUEBOOK        2.440e-03  6.887e-04   3.544 0.000403 ***
## LOG_OLDCLAIM         7.700e-03  7.080e-03   1.088 0.276902    
## LOG_INCOME           1.079e-02  9.613e-03   1.123 0.261747    
## LOG_HOME_VAL        -4.876e-02  6.159e-02  -0.792 0.428609    
## CAR_AGE             -1.731e-04  3.433e-03  -0.050 0.959800    
## CAR_TYPEPanel Truck  8.541e-02  8.723e-02   0.979 0.327615    
## CAR_TYPEPickup       3.912e-02  6.123e-02   0.639 0.523028    
## CAR_TYPESports Car  -4.549e-03  6.448e-02  -0.071 0.943761    
## CAR_TYPESUV          1.332e-02  5.388e-02   0.247 0.804791    
## CAR_TYPEVan          1.705e-02  7.600e-02   0.224 0.822470    
## CAR_USEPrivate       9.352e-03  4.186e-02   0.223 0.823245    
## CLM_FREQ            -4.499e-02  2.787e-02  -1.614 0.106635    
## MVR_PTS              1.523e-02  1.011e-02   1.506 0.132214    
## AGE_YOUNG           -1.391e-02  1.295e-01  -0.107 0.914437    
## AGE_OLD              8.480e-02  2.180e-01   0.389 0.697288    
## HIGH_RISK            4.654e-03  6.160e-02   0.076 0.939777    
## CAR_INCOME_RATIO     7.534e-06  7.957e-06   0.947 0.343824    
## HOMEOWNER            5.783e-01  7.424e-01   0.779 0.436067    
## TOTAL_DRIVERS       -1.815e-02  2.808e-02  -0.646 0.518175    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8077 on 2133 degrees of freedom
## Multiple R-squared:  0.01961,    Adjusted R-squared:  0.01088 
## F-statistic: 2.246 on 19 and 2133 DF,  p-value: 0.001556
# Model 5: Stepwise AIC

cat("\n=== LM5: Stepwise AIC Model ===\n")
## 
## === LM5: Stepwise AIC Model ===
lm_model5 <- stepAIC(
  lm(
    LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + LOG_INCOME + LOG_HOME_VAL +
      CAR_AGE + CAR_TYPE + CAR_USE +
      CLM_FREQ + MVR_PTS +
      AGE_YOUNG + AGE_OLD + HIGH_RISK + CAR_INCOME_RATIO +
      HOMEOWNER + TOTAL_DRIVERS,
    data = train_crash
  ),
  direction = "both",
  trace = FALSE
)

summary(lm_model5)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + CLM_FREQ + MVR_PTS, 
##     data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6682 -0.3975  0.0354  0.3994  3.2143 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.9361024  0.0640206 123.962  < 2e-16 ***
## SQRT_BLUEBOOK  0.0028434  0.0005023   5.660 1.71e-08 ***
## CLM_FREQ      -0.0226374  0.0145751  -1.553   0.1205    
## MVR_PTS        0.0172693  0.0070566   2.447   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8055 on 2149 degrees of freedom
## Multiple R-squared:  0.01749,    Adjusted R-squared:  0.01612 
## F-statistic: 12.75 on 3 and 2149 DF,  p-value: 2.928e-08
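
Stepwise selection retains only SQRT_BLUEBOOK, CLM_FREQ, and MVR_PTS, which suggests that, once a crash has occurred, most of the remaining predictors carry little information about its cost.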
# Model 6: Model with interactions

cat("\n=== LM6: Interaction Model ===\n")
## 
## === LM6: Interaction Model ===
lm_model6 <- lm(LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + LOG_INCOME +
    CAR_AGE + CAR_TYPE + CAR_USE +
    CLM_FREQ * AGE +            # interaction
    MVR_PTS * CAR_TYPE,         # interaction
  data = train_crash
)

summary(lm_model6)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + LOG_OLDCLAIM + 
##     LOG_INCOME + CAR_AGE + CAR_TYPE + CAR_USE + CLM_FREQ * AGE + 
##     MVR_PTS * CAR_TYPE, data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6807 -0.3946  0.0384  0.3959  3.2997 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  7.940e+00  1.463e-01  54.277  < 2e-16 ***
## SQRT_BLUEBOOK                2.571e-03  6.511e-04   3.949 8.12e-05 ***
## LOG_OLDCLAIM                 7.325e-03  6.942e-03   1.055    0.291    
## LOG_INCOME                   1.582e-03  5.190e-03   0.305    0.760    
## CAR_AGE                     -4.723e-04  3.386e-03  -0.139    0.889    
## CAR_TYPEPanel Truck          1.828e-02  1.108e-01   0.165    0.869    
## CAR_TYPEPickup               1.590e-02  8.292e-02   0.192    0.848    
## CAR_TYPESports Car           2.206e-02  8.779e-02   0.251    0.802    
## CAR_TYPESUV                 -3.397e-02  7.333e-02  -0.463    0.643    
## CAR_TYPEVan                  2.924e-02  1.011e-01   0.289    0.772    
## CAR_USEPrivate               4.413e-03  4.166e-02   0.106    0.916    
## CLM_FREQ                    -5.612e-02  6.777e-02  -0.828    0.408    
## AGE                          9.172e-05  2.624e-03   0.035    0.972    
## MVR_PTS                      8.381e-03  1.815e-02   0.462    0.644    
## CLM_FREQ:AGE                 3.279e-04  1.435e-03   0.229    0.819    
## CAR_TYPEPanel Truck:MVR_PTS  2.244e-02  2.927e-02   0.767    0.443    
## CAR_TYPEPickup:MVR_PTS       8.189e-03  2.306e-02   0.355    0.723    
## CAR_TYPESports Car:MVR_PTS  -1.124e-02  2.495e-02  -0.450    0.653    
## CAR_TYPESUV:MVR_PTS          1.854e-02  2.167e-02   0.856    0.392    
## CAR_TYPEVan:MVR_PTS         -6.570e-03  2.887e-02  -0.228    0.820    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8075 on 2133 degrees of freedom
## Multiple R-squared:  0.02006,    Adjusted R-squared:  0.01133 
## F-statistic: 2.298 on 19 and 2133 DF,  p-value: 0.001146
# Model 7: Polynomial model 

cat("\n=== LM7: Polynomial Model ===\n")
## 
## === LM7: Polynomial Model ===
lm_model7 <- lm(LOG_TARGET_AMT ~ SQRT_BLUEBOOK + I(SQRT_BLUEBOOK^2) +
    CAR_AGE + I(CAR_AGE^2) +
    LOG_INCOME + I(LOG_INCOME^2) +
    CLM_FREQ + MVR_PTS +
    CAR_TYPE + CAR_USE,
  data = train_crash
)

summary(lm_model7)
## 
## Call:
## lm(formula = LOG_TARGET_AMT ~ SQRT_BLUEBOOK + I(SQRT_BLUEBOOK^2) + 
##     CAR_AGE + I(CAR_AGE^2) + LOG_INCOME + I(LOG_INCOME^2) + CLM_FREQ + 
##     MVR_PTS + CAR_TYPE + CAR_USE, data = train_crash)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7004 -0.3933  0.0378  0.4019  3.2084 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.464e+00  1.734e-01  43.039  < 2e-16 ***
## SQRT_BLUEBOOK        1.114e-02  2.808e-03   3.969 7.45e-05 ***
## I(SQRT_BLUEBOOK^2)  -3.771e-05  1.218e-05  -3.095  0.00199 ** 
## CAR_AGE              5.895e-03  9.765e-03   0.604  0.54611    
## I(CAR_AGE^2)        -3.110e-04  5.377e-04  -0.578  0.56304    
## LOG_INCOME           1.497e-02  2.952e-02   0.507  0.61204    
## I(LOG_INCOME^2)     -1.272e-03  2.638e-03  -0.482  0.62981    
## CLM_FREQ            -2.350e-02  1.463e-02  -1.606  0.10833    
## MVR_PTS              1.733e-02  7.093e-03   2.443  0.01465 *  
## CAR_TYPEPanel Truck  1.809e-01  9.315e-02   1.942  0.05228 .  
## CAR_TYPEPickup       3.522e-02  6.098e-02   0.578  0.56360    
## CAR_TYPESports Car   2.610e-03  6.409e-02   0.041  0.96752    
## CAR_TYPESUV          2.178e-03  5.363e-02   0.041  0.96761    
## CAR_TYPEVan          2.276e-02  7.578e-02   0.300  0.76398    
## CAR_USEPrivate       1.277e-02  4.188e-02   0.305  0.76036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8054 on 2138 degrees of freedom
## Multiple R-squared:  0.02286,    Adjusted R-squared:  0.01646 
## F-statistic: 3.573 on 14 and 2138 DF,  p-value: 7.161e-06
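
All of these severity models are fit on LOG_TARGET_AMT = log(TARGET_AMT + 1), so predictions on the dollar scale need the inverse transform. A minimal sketch (assuming test_crash carries the same transformed columns used in training):

pred_log <- predict(lm_model5, newdata = test_crash)  # predictions on the log scale
pred_amt <- exp(pred_log) - 1                         # naive back-transform to dollars

Note that this naive back-transform understates the conditional mean of a right-skewed response; a smearing correction such as Duan's estimator could be applied if mean-scale accuracy matters.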

4.6 Check for Multicollinearity

We know we have some strongly correlated features, so let's address this potential issue by examining the VIF for each model and removing predictors where needed.

For the logistic regression models, no predictor exceeds the usual VIF > 5 threshold (for categorical terms with Df > 1, we must look at the GVIF^(1/(2*Df)) value instead and compare it against sqrt(5) ≈ 2.24), so we don't need to remove any predictors.

The linear models are less clean: in LM2 and LM4, LOG_HOME_VAL and HOMEOWNER show severe collinearity (GVIF around 438 and 434, likely because HOMEOWNER is derived from HOME_VAL), and the raw interaction and polynomial terms in LM6 and LM7 are, as expected, strongly collinear with their base terms.
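
As a convenience, a small hypothetical helper can apply this rule of thumb programmatically, flagging any term whose adjusted GVIF exceeds sqrt(5) ≈ 2.24 (equivalent to VIF > 5 for Df = 1 terms):

flag_high_vif <- function(model, threshold = sqrt(5)) {
  v <- car::vif(model)
  if (is.matrix(v)) {
    adj <- v[, "GVIF^(1/(2*Df))"]  # categorical-aware adjusted GVIF
  } else {
    adj <- sqrt(v)                 # plain VIFs, put on the same scale
  }
  names(adj)[adj > threshold]
}
# e.g. flag_high_vif(lm_model2) flags LOG_HOME_VAL and HOMEOWNER (see below)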

# Logistic Regression Models

cat("=== VIF for Logistic Regression Model 1 (Original Dataset) ===\n")
## === VIF for Logistic Regression Model 1 (Original Dataset) ===
car::vif(logit_model1)
##                 GVIF Df GVIF^(1/(2*Df))
## AGE         1.453486  1        1.205606
## BLUEBOOK    2.197759  1        1.482484
## CAR_AGE     2.009228  1        1.417472
## CAR_TYPE    6.261275  5        1.201341
## CAR_USE     2.211000  1        1.486943
## CLM_FREQ    1.473096  1        1.213712
## EDUCATION   9.194405  3        1.447396
## HOMEKIDS    2.185889  1        1.478475
## HOME_VAL    1.879334  1        1.370888
## INCOME      2.512717  1        1.585155
## JOB        22.514875  8        1.214869
## KIDSDRIV    1.334501  1        1.155206
## MSTATUS     2.073668  1        1.440024
## MVR_PTS     1.171888  1        1.082538
## OLDCLAIM    1.638979  1        1.280226
## PARENT1     1.948642  1        1.395938
## RED_CAR     1.826195  1        1.351368
## REVOKED     1.306134  1        1.142862
## SEX         3.709905  1        1.926111
## TIF         1.009130  1        1.004555
## TRAVTIME    1.042725  1        1.021139
## URBANICITY  1.148561  1        1.071709
## YOJ         1.446233  1        1.202594
cat("=== VIF for Logistic Regression Model 1B (SMOTE + Standardized) ===\n")
## === VIF for Logistic Regression Model 1B (SMOTE + Standardized) ===
car::vif(logit_model1b)
##                 GVIF Df GVIF^(1/(2*Df))
## AGE         1.494277  1        1.222406
## BLUEBOOK    2.152023  1        1.466977
## CAR_AGE     2.114405  1        1.454099
## CAR_TYPE    6.331512  5        1.202682
## CAR_USE     2.415683  1        1.554247
## CLM_FREQ    1.494993  1        1.222699
## EDUCATION  11.379517  3        1.499756
## HOMEKIDS    2.175773  1        1.475050
## HOME_VAL    1.999326  1        1.413975
## INCOME      2.672750  1        1.634855
## JOB        30.183605  8        1.237330
## KIDSDRIV    1.343991  1        1.159306
## MSTATUS     2.044542  1        1.429875
## MVR_PTS     1.175589  1        1.084246
## OLDCLAIM    1.630242  1        1.276809
## PARENT1     1.863041  1        1.364933
## RED_CAR     1.887419  1        1.373834
## REVOKED     1.275821  1        1.129522
## SEX         3.719656  1        1.928641
## TIF         1.014645  1        1.007296
## TRAVTIME    1.042611  1        1.021083
## URBANICITY  1.160051  1        1.077057
## YOJ         1.500115  1        1.224792
cat("=== VIF for Logistic Regression Model 2 (Stepwise - Model 1) ===\n")
## === VIF for Logistic Regression Model 2 (Stepwise - Model 1) ===
car::vif(logit_model2)
##                 GVIF Df GVIF^(1/(2*Df))
## BLUEBOOK    1.770719  1        1.330684
## CAR_TYPE    2.493147  5        1.095657
## CAR_USE     2.205527  1        1.485102
## CLM_FREQ    1.472481  1        1.213458
## EDUCATION   6.961135  3        1.381805
## HOME_VAL    1.861225  1        1.364267
## INCOME      2.499022  1        1.580830
## JOB        20.999802  8        1.209591
## KIDSDRIV    1.088701  1        1.043409
## MSTATUS     1.914384  1        1.383613
## MVR_PTS     1.170633  1        1.081958
## OLDCLAIM    1.637648  1        1.279706
## PARENT1     1.430834  1        1.196175
## REVOKED     1.304385  1        1.142097
## TIF         1.008513  1        1.004247
## TRAVTIME    1.041268  1        1.020426
## URBANICITY  1.148127  1        1.071507
## YOJ         1.365754  1        1.168655
cat("=== VIF for Logistic Regression Model 2B (Stepwise - Model 1B) ===\n")
## === VIF for Logistic Regression Model 2B (Stepwise - Model 1B) ===
car::vif(logit_model2b)
##                 GVIF Df GVIF^(1/(2*Df))
## BLUEBOOK    1.864609  1        1.365507
## CAR_TYPE    3.478982  5        1.132779
## CAR_USE     2.412900  1        1.553351
## CLM_FREQ    1.493600  1        1.222129
## EDUCATION   8.726510  3        1.434851
## HOME_VAL    1.984482  1        1.408716
## INCOME      2.659286  1        1.630732
## JOB        28.259860  8        1.232248
## KIDSDRIV    1.090895  1        1.044459
## MSTATUS     1.914517  1        1.383661
## MVR_PTS     1.174337  1        1.083668
## OLDCLAIM    1.629091  1        1.276358
## PARENT1     1.407267  1        1.186283
## RED_CAR     1.488978  1        1.220237
## REVOKED     1.274468  1        1.128923
## TIF         1.013651  1        1.006802
## TRAVTIME    1.042044  1        1.020806
## URBANICITY  1.159532  1        1.076816
## YOJ         1.410669  1        1.187716
cat("=== VIF for Logistic Regression Model 3 (SMOTE Transformed) ===\n")
## === VIF for Logistic Regression Model 3 (SMOTE Transformed) ===
car::vif(logit_model3)
##                    GVIF Df GVIF^(1/(2*Df))
## AGE            1.505423  1        1.226957
## SQRT_BLUEBOOK  1.871071  1        1.367871
## LOG_CAR_AGE    1.699104  1        1.303497
## CAR_TYPE       5.635967  5        1.188767
## CAR_USE        2.369927  1        1.539457
## CLM_FREQ       3.673577  1        1.916658
## EDUCATION      8.610431  3        1.431652
## HOMEKIDS       2.173046  1        1.474126
## LOG_HOME_VAL   1.950121  1        1.396467
## LOG_INCOME     3.623382  1        1.903518
## JOB           34.565921  8        1.247859
## KIDSDRIV       1.355306  1        1.164176
## MSTATUS        2.185008  1        1.478177
## MVR_PTS        1.252931  1        1.119344
## LOG_OLDCLAIM   3.901095  1        1.975119
## PARENT1        1.789260  1        1.337632
## RED_CAR        1.881666  1        1.371738
## REVOKED        1.032469  1        1.016105
## SEX            3.561483  1        1.887189
## LOG_TIF        1.017313  1        1.008619
## LOG_TRAVTIME   1.035052  1        1.017375
## URBANICITY     1.160375  1        1.077207
## YOJ            2.333283  1        1.527509
cat("=== VIF for Logistic Regression Model 4 (SMOTE New Features) ===\n")
## === VIF for Logistic Regression Model 4 (SMOTE New Features) ===
car::vif(logit_model4)
##                       GVIF Df GVIF^(1/(2*Df))
## CAR_AGE           2.025268  1        1.423119
## CAR_TYPE          3.931659  5        1.146721
## CAR_USE           2.347750  1        1.532237
## EDUCATION         8.865713  3        1.438640
## HOMEKIDS          1.775343  1        1.332420
## JOB              26.135248  8        1.226243
## MSTATUS           2.155324  1        1.468102
## OLDCLAIM          1.405127  1        1.185381
## PARENT1           1.762708  1        1.327670
## RED_CAR           1.903745  1        1.379763
## SEX               3.224026  1        1.795557
## TIF               1.016021  1        1.007979
## TRAVTIME          1.042285  1        1.020923
## URBANICITY        1.162497  1        1.078192
## YOJ               1.840105  1        1.356505
## AGE_YOUNG         1.037224  1        1.018442
## AGE_OLD           1.024686  1        1.012268
## HIGH_RISK         1.143681  1        1.069430
## REVOKED_NUM       1.279408  1        1.131109
## CAR_INCOME_RATIO  1.894142  1        1.376278
## HOMEOWNER         1.944460  1        1.394439
## TOTAL_DRIVERS     1.308610  1        1.143945
cat("=== VIF for Logistic Regression Model 4B (SMOTE Transformed + New Features) ===\n")
## === VIF for Logistic Regression Model 4B (SMOTE Transformed + New Features) ===
car::vif(logit_model4b)
##                       GVIF Df GVIF^(1/(2*Df))
## LOG_CAR_AGE       1.701633  1        1.304467
## CAR_TYPE          3.925610  5        1.146544
## CAR_USE           2.349883  1        1.532933
## EDUCATION         8.488464  3        1.428252
## HOMEKIDS          1.784563  1        1.335875
## JOB              26.062550  8        1.226030
## MSTATUS           2.161869  1        1.470330
## LOG_OLDCLAIM      1.540733  1        1.241263
## PARENT1           1.772840  1        1.331480
## RED_CAR           1.906170  1        1.380641
## SEX               3.229584  1        1.797104
## LOG_TIF           1.016545  1        1.008239
## LOG_TRAVTIME      1.033864  1        1.016791
## URBANICITY        1.158781  1        1.076467
## YOJ               1.840153  1        1.356522
## AGE_YOUNG         1.036958  1        1.018311
## AGE_OLD           1.024218  1        1.012037
## HIGH_RISK         1.520993  1        1.233285
## REVOKED_NUM       1.017083  1        1.008505
## CAR_INCOME_RATIO  1.894126  1        1.376272
## HOMEOWNER         1.949532  1        1.396256
## TOTAL_DRIVERS     1.313436  1        1.146052
# Multiple linear models

cat("=== LM1: Basic Model ===\n")
## === LM1: Basic Model ===
print(car::vif(lm_model1))
##                   GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK 1.634619  1        1.278522
## LOG_OLDCLAIM  3.055449  1        1.747984
## LOG_INCOME    1.101921  1        1.049724
## CAR_AGE       1.065863  1        1.032406
## CAR_TYPE      2.059942  5        1.074943
## CAR_USE       1.424441  1        1.193499
## CLM_FREQ      2.928292  1        1.711225
## MVR_PTS       1.158223  1        1.076208
cat("\n=== LM2: Full Model ===\n")
## 
## === LM2: Full Model ===
print(car::vif(lm_model2))
##                        GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK      1.871192  1        1.367915
## LOG_OLDCLAIM       3.185512  1        1.784800
## LOG_INCOME         3.791385  1        1.947148
## LOG_HOME_VAL     437.912390  1       20.926356
## CAR_AGE            1.111658  1        1.054352
## CAR_TYPE           2.129760  5        1.078532
## CAR_USE            1.444762  1        1.201982
## CLM_FREQ           3.993058  1        1.998264
## MVR_PTS            2.242908  1        1.497634
## AGE_YOUNG          1.033258  1        1.016493
## AGE_OLD            1.013167  1        1.006562
## HIGH_RISK          3.033894  1        1.741808
## CAR_INCOME_RATIO   3.228129  1        1.796699
## HOMEOWNER        433.904578  1       20.830376
## TOTAL_DRIVERS      1.022413  1        1.011144
cat("\n=== LM3: Log-transformed ===\n")
## 
## === LM3: Log-transformed ===
print(car::vif(lm_model3))
##                   GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK 1.634619  1        1.278522
## CAR_AGE       1.065863  1        1.032406
## CAR_TYPE      2.059942  5        1.074943
## LOG_OLDCLAIM  3.055449  1        1.747984
## CLM_FREQ      2.928292  1        1.711225
## MVR_PTS       1.158223  1        1.076208
## LOG_INCOME    1.101921  1        1.049724
## CAR_USE       1.424441  1        1.193499
cat("\n=== LM4: Log + New Features ===\n")
## 
## === LM4: Log + New Features ===
print(car::vif(lm_model4))
##                        GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK      1.871192  1        1.367915
## LOG_OLDCLAIM       3.185512  1        1.784800
## LOG_INCOME         3.791385  1        1.947148
## LOG_HOME_VAL     437.912390  1       20.926356
## CAR_AGE            1.111658  1        1.054352
## CAR_TYPE           2.129760  5        1.078532
## CAR_USE            1.444762  1        1.201982
## CLM_FREQ           3.993058  1        1.998264
## MVR_PTS            2.242908  1        1.497634
## AGE_YOUNG          1.033258  1        1.016493
## AGE_OLD            1.013167  1        1.006562
## HIGH_RISK          3.033894  1        1.741808
## CAR_INCOME_RATIO   3.228129  1        1.796699
## HOMEOWNER        433.904578  1       20.830376
## TOTAL_DRIVERS      1.022413  1        1.011144
cat("\n=== LM5: Stepwise AIC Model ===\n")
## 
## === LM5: Stepwise AIC Model ===
print(car::vif(lm_model5))
## SQRT_BLUEBOOK      CLM_FREQ       MVR_PTS 
##      1.000878      1.097958      1.098598
cat("\n=== LM6: Interaction Model ===\n")
## 
## === LM6: Interaction Model ===
print(car::vif(lm_model6))
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                        GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK      1.673248  1        1.293541
## LOG_OLDCLAIM       3.063823  1        1.750378
## LOG_INCOME         1.105354  1        1.051358
## CAR_AGE            1.081960  1        1.040173
## CAR_TYPE          39.910583  5        1.445802
## CAR_USE            1.431287  1        1.196364
## CLM_FREQ          23.621964  1        4.860243
## AGE                2.073587  1        1.439995
## MVR_PTS            7.231164  1        2.689082
## CLM_FREQ:AGE      22.401296  1        4.733001
## CAR_TYPE:MVR_PTS 114.165821  5        1.606030
cat("\n=== LM7: Polynomial Model ===\n")
## 
## === LM7: Polynomial Model ===
print(car::vif(lm_model7))
##                         GVIF Df GVIF^(1/(2*Df))
## SQRT_BLUEBOOK      31.276297  1        5.592522
## I(SQRT_BLUEBOOK^2) 33.928016  1        5.824776
## CAR_AGE             9.045596  1        3.007590
## I(CAR_AGE^2)        9.144451  1        3.023979
## LOG_INCOME         35.955761  1        5.996312
## I(LOG_INCOME^2)    37.647558  1        6.135761
## CLM_FREQ            1.106749  1        1.052022
## MVR_PTS             1.110442  1        1.053775
## CAR_TYPE            2.555535  5        1.098369
## CAR_USE             1.454060  1        1.205844
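
The very large GVIFs for LM7 are expected: a raw polynomial term such as I(SQRT_BLUEBOOK^2) is strongly correlated with its base term by construction. One common remedy, sketched below as a hypothetical refit (poly() does not accept missing values, so this assumes the NAs were imputed during data preparation), is to use orthogonal polynomials:

lm_model7_ortho <- lm(
  LOG_TARGET_AMT ~ poly(SQRT_BLUEBOOK, 2) + poly(CAR_AGE, 2) +
    poly(LOG_INCOME, 2) + CLM_FREQ + MVR_PTS + CAR_TYPE + CAR_USE,
  data = train_crash
)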

4.7 Interpretation of Coefficients

# Logistic Regression Models

# Interpretation of coefficients (Model 1)
cat("\n=== Coefficient Interpretation (Model 1) ===\n")
## 
## === Coefficient Interpretation (Model 1) ===
coef_exp <- exp(coef(logit_model1))
print(round(coef_exp, 4))
##                   (Intercept)                           AGE 
##                        0.0394                        0.9965 
##                      BLUEBOOK                       CAR_AGE 
##                        1.0000                        0.9962 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.7226                        1.7302 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        2.9039                        2.2008 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        2.0309                        0.4622 
##                      CLM_FREQ          EDUCATIONHigh School 
##                        1.1878                        1.4871 
##              EDUCATIONMasters                  EDUCATIONPhD 
##                        1.0665                        1.2188 
##                      HOMEKIDS                      HOME_VAL 
##                        1.0260                        1.0000 
##                        INCOME                JOBBlue Collar 
##                        1.0000                        1.4608 
##                   JOBClerical                     JOBDoctor 
##                        1.5903                        0.7596 
##                 JOBHome Maker                     JOBLawyer 
##                        1.3574                        1.1872 
##                    JOBManager               JOBProfessional 
##                        0.6228                        1.2920 
##                    JOBStudent                      KIDSDRIV 
##                        1.3250                        1.4848 
##                    MSTATUSYes                       MVR_PTS 
##                        0.6109                        1.1187 
##                      OLDCLAIM                    PARENT1Yes 
##                        1.0000                        1.6118 
##                    RED_CARyes                    REVOKEDYes 
##                        0.9830                        2.2045 
##                          SEXM                           TIF 
##                        1.0706                        0.9452 
##                      TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.0150                       11.5789 
##                           YOJ 
##                        0.9840
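
These exponentiated coefficients are odds ratios: holding the other predictors fixed, URBANICITYHighly Urban/ Urban multiplies the odds of a crash by about 11.6, while MSTATUSYes multiplies them by about 0.61 (roughly 39% lower odds). A small hypothetical helper, in the same spirit as the interpret_lm() function defined later for the linear models, could tabulate this as percent change in odds:

interpret_logit <- function(model) {
  or <- exp(coef(model))  # odds ratios
  data.frame(
    Odds_Ratio      = round(or, 4),
    Pct_Odds_Change = round((or - 1) * 100, 2)  # % change in odds per unit increase
  )
}
# e.g. interpret_logit(logit_model2)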
# Interpretation of coefficients (Model 1B)
cat("\n=== Coefficient Interpretation (Model 1B) ===\n")
## 
## === Coefficient Interpretation (Model 1B) ===
coef_exp <- exp(coef(logit_model1b))
print(round(coef_exp, 4))
##                   (Intercept)                           AGE 
##                        0.0470                        0.9825 
##                      BLUEBOOK                       CAR_AGE 
##                        0.8342                        0.9995 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.5945                        1.6982 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        3.1507                        2.6675 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        1.9036                        0.4367 
##                      CLM_FREQ          EDUCATIONHigh School 
##                        1.1891                        1.6429 
##              EDUCATIONMasters                  EDUCATIONPhD 
##                        0.9629                        0.9347 
##                      HOMEKIDS                      HOME_VAL 
##                        1.0032                        0.8207 
##                        INCOME                JOBBlue Collar 
##                        0.9019                        1.1760 
##                   JOBClerical                     JOBDoctor 
##                        1.3052                        0.7839 
##                 JOBHome Maker                     JOBLawyer 
##                        1.0186                        1.1723 
##                    JOBManager               JOBProfessional 
##                        0.4460                        1.0718 
##                    JOBStudent                      KIDSDRIV 
##                        0.9642                        1.2282 
##                    MSTATUSYes                       MVR_PTS 
##                        0.6530                        1.2554 
##                      OLDCLAIM                    PARENT1Yes 
##                        0.9416                        1.7310 
##                    RED_CARyes                    REVOKEDYes 
##                        1.0937                        1.4882 
##                          SEXM                           TIF 
##                        1.0719                        0.7293 
##                      TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.2917                       19.6731 
##                           YOJ 
##                        0.9329
# Interpretation of coefficients (Model 2)
cat("\n=== Coefficient Interpretation (Model 2) ===\n")
## 
## === Coefficient Interpretation (Model 2) ===
coef_exp <- exp(coef(logit_model2))
print(round(coef_exp, 4))
##                   (Intercept)                      BLUEBOOK 
##                        0.0346                        1.0000 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.7906                        1.7289 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        2.7798                        2.1177 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        2.0789                        0.4645 
##                      CLM_FREQ          EDUCATIONHigh School 
##                        1.1878                        1.5142 
##              EDUCATIONMasters                  EDUCATIONPhD 
##                        1.0392                        1.1798 
##                      HOME_VAL                        INCOME 
##                        1.0000                        1.0000 
##                JOBBlue Collar                   JOBClerical 
##                        1.4590                        1.5975 
##                     JOBDoctor                 JOBHome Maker 
##                        0.7506                        1.3379 
##                     JOBLawyer                    JOBManager 
##                        1.1707                        0.6138 
##               JOBProfessional                    JOBStudent 
##                        1.2770                        1.3381 
##                      KIDSDRIV                    MSTATUSYes 
##                        1.5094                        0.6254 
##                       MVR_PTS                      OLDCLAIM 
##                        1.1192                        1.0000 
##                    PARENT1Yes                    REVOKEDYes 
##                        1.7218                        2.2134 
##                           TIF                      TRAVTIME 
##                        0.9453                        1.0149 
## URBANICITYHighly Urban/ Urban                           YOJ 
##                       11.6050                        0.9844
# Interpretation of coefficients (Model 2B)
cat("\n=== Coefficient Interpretation (Model 2B) ===\n")
## 
## === Coefficient Interpretation (Model 2B) ===
coef_exp <- exp(coef(logit_model2b))
print(round(coef_exp, 4))
##                   (Intercept)                      BLUEBOOK 
##                        0.0484                        0.8247 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.6472                        1.6960 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        3.0215                        2.5673 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        1.9376                        0.4372 
##                      CLM_FREQ          EDUCATIONHigh School 
##                        1.1896                        1.6447 
##              EDUCATIONMasters                  EDUCATIONPhD 
##                        0.9614                        0.9275 
##                      HOME_VAL                        INCOME 
##                        0.8199                        0.9026 
##                JOBBlue Collar                   JOBClerical 
##                        1.1793                        1.3121 
##                     JOBDoctor                 JOBHome Maker 
##                        0.7826                        1.0068 
##                     JOBLawyer                    JOBManager 
##                        1.1671                        0.4442 
##               JOBProfessional                    JOBStudent 
##                        1.0699                        0.9666 
##                      KIDSDRIV                    MSTATUSYes 
##                        1.2289                        0.6561 
##                       MVR_PTS                      OLDCLAIM 
##                        1.2553                        0.9410 
##                    PARENT1Yes                    RED_CARyes 
##                        1.7625                        1.1197 
##                    REVOKEDYes                           TIF 
##                        1.4902                        0.7294 
##                      TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.2914                       19.7026 
##                           YOJ 
##                        0.9315
# Interpretation of coefficients (Model 3)
cat("\n=== Coefficient Interpretation (Model 3) ===\n")
## 
## === Coefficient Interpretation (Model 3) ===
coef_exp <- exp(coef(logit_model3))
print(round(coef_exp, 4))
##                   (Intercept)                           AGE 
##                        0.1276                        0.9902 
##                 SQRT_BLUEBOOK                   LOG_CAR_AGE 
##                        0.9941                        1.0209 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.5895                        1.7016 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        2.9086                        2.8224 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        1.9970                        0.4435 
##                      CLM_FREQ          EDUCATIONHigh School 
##                        1.0431                        1.7088 
##              EDUCATIONMasters                  EDUCATIONPhD 
##                        0.9796                        0.9341 
##                      HOMEKIDS                  LOG_HOME_VAL 
##                        0.9475                        0.9592 
##                    LOG_INCOME                JOBBlue Collar 
##                        0.8748                        1.4174 
##                   JOBClerical                     JOBDoctor 
##                        1.5837                        0.6259 
##                 JOBHome Maker                     JOBLawyer 
##                        0.7554                        1.2744 
##                    JOBManager               JOBProfessional 
##                        0.5323                        1.2691 
##                    JOBStudent                      KIDSDRIV 
##                        0.6813                        1.5357 
##                    MSTATUSYes                       MVR_PTS 
##                        0.6641                        1.1040 
##                  LOG_OLDCLAIM                    PARENT1Yes 
##                        1.0330                        1.6364 
##                    RED_CARyes                    REVOKEDYes 
##                        1.1008                        1.5152 
##                          SEXM                       LOG_TIF 
##                        1.0510                        0.6614 
##                  LOG_TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.6558                       18.8970 
##                           YOJ 
##                        1.0319
# Interpretation of coefficients (Model 4)
cat("\n=== Coefficient Interpretation (Model 4) ===\n")
## 
## === Coefficient Interpretation (Model 4) ===
coef_exp <- exp(coef(logit_model4))
print(round(coef_exp, 4))
##                   (Intercept)                       CAR_AGE 
##                        0.0179                        1.0043 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.1343                        1.8822 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        3.8492                        3.6183 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        1.6982                        0.4512 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                        1.8124                        0.9335 
##                  EDUCATIONPhD                      HOMEKIDS 
##                        0.8079                        1.0120 
##                JOBBlue Collar                   JOBClerical 
##                        1.4449                        1.7165 
##                     JOBDoctor                 JOBHome Maker 
##                        0.5574                        1.1439 
##                     JOBLawyer                    JOBManager 
##                        1.2097                        0.5044 
##               JOBProfessional                    JOBStudent 
##                        1.2541                        1.0241 
##                    MSTATUSYes                      OLDCLAIM 
##                        0.6723                        1.0000 
##                    PARENT1Yes                    RED_CARyes 
##                        1.6281                        1.1249 
##                          SEXM                           TIF 
##                        1.2993                        0.9309 
##                      TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.0168                       21.2828 
##                           YOJ                     AGE_YOUNG 
##                        1.0001                        1.6227 
##                       AGE_OLD                     HIGH_RISK 
##                        0.5730                        1.7801 
##                   REVOKED_NUM              CAR_INCOME_RATIO 
##                        1.7942                        1.0001 
##                     HOMEOWNER                 TOTAL_DRIVERS 
##                        0.5895                        1.5191
# Interpretation of coefficients (Model 4B)
cat("\n=== Coefficient Interpretation (Model 4B) ===\n")
## 
## === Coefficient Interpretation (Model 4B) ===
coef_exp <- exp(coef(logit_model4b))
print(round(coef_exp, 4))
##                   (Intercept)                   LOG_CAR_AGE 
##                        0.0076                        1.0168 
##           CAR_TYPEPanel Truck                CAR_TYPEPickup 
##                        1.1154                        1.8614 
##            CAR_TYPESports Car                   CAR_TYPESUV 
##                        3.7594                        3.5134 
##                   CAR_TYPEVan                CAR_USEPrivate 
##                        1.6802                        0.4589 
##          EDUCATIONHigh School              EDUCATIONMasters 
##                        1.8230                        0.9552 
##                  EDUCATIONPhD                      HOMEKIDS 
##                        0.8493                        1.0110 
##                JOBBlue Collar                   JOBClerical 
##                        1.4749                        1.7303 
##                     JOBDoctor                 JOBHome Maker 
##                        0.5666                        1.1394 
##                     JOBLawyer                    JOBManager 
##                        1.2372                        0.5114 
##               JOBProfessional                    JOBStudent 
##                        1.2703                        1.0399 
##                    MSTATUSYes                  LOG_OLDCLAIM 
##                        0.6887                        1.0473 
##                    PARENT1Yes                    RED_CARyes 
##                        1.6764                        1.1316 
##                          SEXM                       LOG_TIF 
##                        1.2868                        0.6610 
##                  LOG_TRAVTIME URBANICITYHighly Urban/ Urban 
##                        1.6430                       19.0999 
##                           YOJ                     AGE_YOUNG 
##                        0.9985                        1.7303 
##                       AGE_OLD                     HIGH_RISK 
##                        0.5847                        1.3609 
##                   REVOKED_NUM              CAR_INCOME_RATIO 
##                        1.6591                        1.0001 
##                     HOMEOWNER                 TOTAL_DRIVERS 
##                        0.5931                        1.4880
# Multiple Linear Models

# Function to convert the linear models' log-scale coefficients into the
# approximate percent change in TARGET_AMT + 1 per one-unit increase in a predictor
interpret_lm <- function(model) {
  coefs <- coef(model)
  pct   <- (exp(coefs) - 1) * 100  # percent change implied by a log response
  out   <- data.frame(
    Coefficient = round(coefs, 4),
    Percent_Change = round(pct, 2)
  )
  return(out)
}

# Interpretation for LM1
cat("\n=== Coefficient Interpretation (LM1: Basic Model) ===\n")
## 
## === Coefficient Interpretation (LM1: Basic Model) ===
print(interpret_lm(lm_model1))
##                     Coefficient Percent_Change
## (Intercept)              7.9175      274379.50
## SQRT_BLUEBOOK            0.0026           0.26
## LOG_OLDCLAIM             0.0075           0.75
## LOG_INCOME               0.0018           0.18
## CAR_AGE                 -0.0005          -0.05
## CAR_TYPEPanel Truck      0.0727           7.54
## CAR_TYPEPickup           0.0376           3.83
## CAR_TYPESports Car      -0.0078          -0.78
## CAR_TYPESUV              0.0111           1.12
## CAR_TYPEVan              0.0139           1.39
## CAR_USEPrivate           0.0086           0.86
## CLM_FREQ                -0.0429          -4.20
## MVR_PTS                  0.0156           1.57
# Interpretation for LM2
cat("\n=== Coefficient Interpretation (LM2: Full Model) ===\n")
## 
## === Coefficient Interpretation (LM2: Full Model) ===
print(interpret_lm(lm_model2))
##                     Coefficient Percent_Change
## (Intercept)              7.8682      261173.72
## SQRT_BLUEBOOK            0.0024           0.24
## LOG_OLDCLAIM             0.0077           0.77
## LOG_INCOME               0.0108           1.09
## LOG_HOME_VAL            -0.0488          -4.76
## CAR_AGE                 -0.0002          -0.02
## CAR_TYPEPanel Truck      0.0854           8.92
## CAR_TYPEPickup           0.0391           3.99
## CAR_TYPESports Car      -0.0045          -0.45
## CAR_TYPESUV              0.0133           1.34
## CAR_TYPEVan              0.0171           1.72
## CAR_USEPrivate           0.0094           0.94
## CLM_FREQ                -0.0450          -4.40
## MVR_PTS                  0.0152           1.53
## AGE_YOUNG               -0.0139          -1.38
## AGE_OLD                  0.0848           8.85
## HIGH_RISK                0.0047           0.47
## CAR_INCOME_RATIO         0.0000           0.00
## HOMEOWNER                0.5783          78.30
## TOTAL_DRIVERS           -0.0181          -1.80
# Interpretation for LM3
cat("\n=== Coefficient Interpretation (LM3: Log-Transformed) ===\n")
## 
## === Coefficient Interpretation (LM3: Log-Transformed) ===
print(interpret_lm(lm_model3))
##                     Coefficient Percent_Change
## (Intercept)              7.9175      274379.50
## SQRT_BLUEBOOK            0.0026           0.26
## CAR_AGE                 -0.0005          -0.05
## CAR_TYPEPanel Truck      0.0727           7.54
## CAR_TYPEPickup           0.0376           3.83
## CAR_TYPESports Car      -0.0078          -0.78
## CAR_TYPESUV              0.0111           1.12
## CAR_TYPEVan              0.0139           1.39
## LOG_OLDCLAIM             0.0075           0.75
## CLM_FREQ                -0.0429          -4.20
## MVR_PTS                  0.0156           1.57
## LOG_INCOME               0.0018           0.18
## CAR_USEPrivate           0.0086           0.86
# Interpretation for LM4
cat("\n=== Coefficient Interpretation (LM4: Log + New Features) ===\n")
## 
## === Coefficient Interpretation (LM4: Log + New Features) ===
print(interpret_lm(lm_model4))
##                     Coefficient Percent_Change
## (Intercept)              7.8682      261173.72
## SQRT_BLUEBOOK            0.0024           0.24
## LOG_OLDCLAIM             0.0077           0.77
## LOG_INCOME               0.0108           1.09
## LOG_HOME_VAL            -0.0488          -4.76
## CAR_AGE                 -0.0002          -0.02
## CAR_TYPEPanel Truck      0.0854           8.92
## CAR_TYPEPickup           0.0391           3.99
## CAR_TYPESports Car      -0.0045          -0.45
## CAR_TYPESUV              0.0133           1.34
## CAR_TYPEVan              0.0171           1.72
## CAR_USEPrivate           0.0094           0.94
## CLM_FREQ                -0.0450          -4.40
## MVR_PTS                  0.0152           1.53
## AGE_YOUNG               -0.0139          -1.38
## AGE_OLD                  0.0848           8.85
## HIGH_RISK                0.0047           0.47
## CAR_INCOME_RATIO         0.0000           0.00
## HOMEOWNER                0.5783          78.30
## TOTAL_DRIVERS           -0.0181          -1.80
# Interpretation for LM5
cat("\n=== Coefficient Interpretation (LM5: Stepwise AIC Model) ===\n")
## 
## === Coefficient Interpretation (LM5: Stepwise AIC Model) ===
print(interpret_lm(lm_model5))
##               Coefficient Percent_Change
## (Intercept)        7.9361      279543.98
## SQRT_BLUEBOOK      0.0028           0.28
## CLM_FREQ          -0.0226          -2.24
## MVR_PTS            0.0173           1.74
# Interpretation for LM6
cat("\n=== Coefficient Interpretation (LM6: Interaction Model) ===\n")
## 
## === Coefficient Interpretation (LM6: Interaction Model) ===
print(interpret_lm(lm_model6))
##                             Coefficient Percent_Change
## (Intercept)                      7.9398      280591.86
## SQRT_BLUEBOOK                    0.0026           0.26
## LOG_OLDCLAIM                     0.0073           0.74
## LOG_INCOME                       0.0016           0.16
## CAR_AGE                         -0.0005          -0.05
## CAR_TYPEPanel Truck              0.0183           1.84
## CAR_TYPEPickup                   0.0159           1.60
## CAR_TYPESports Car               0.0221           2.23
## CAR_TYPESUV                     -0.0340          -3.34
## CAR_TYPEVan                      0.0292           2.97
## CAR_USEPrivate                   0.0044           0.44
## CLM_FREQ                        -0.0561          -5.46
## AGE                              0.0001           0.01
## MVR_PTS                          0.0084           0.84
## CLM_FREQ:AGE                     0.0003           0.03
## CAR_TYPEPanel Truck:MVR_PTS      0.0224           2.27
## CAR_TYPEPickup:MVR_PTS           0.0082           0.82
## CAR_TYPESports Car:MVR_PTS      -0.0112          -1.12
## CAR_TYPESUV:MVR_PTS              0.0185           1.87
## CAR_TYPEVan:MVR_PTS             -0.0066          -0.65
# Interpretation for LM7
cat("\n=== Coefficient Interpretation (LM7: Polynomial Model) ===\n")
## 
## === Coefficient Interpretation (LM7: Polynomial Model) ===
print(interpret_lm(lm_model7))
##                     Coefficient Percent_Change
## (Intercept)              7.4641      174320.25
## SQRT_BLUEBOOK            0.0111           1.12
## I(SQRT_BLUEBOOK^2)       0.0000           0.00
## CAR_AGE                  0.0059           0.59
## I(CAR_AGE^2)            -0.0003          -0.03
## LOG_INCOME               0.0150           1.51
## I(LOG_INCOME^2)         -0.0013          -0.13
## CLM_FREQ                -0.0235          -2.32
## MVR_PTS                  0.0173           1.75
## CAR_TYPEPanel Truck      0.1809          19.83
## CAR_TYPEPickup           0.0352           3.58
## CAR_TYPESports Car       0.0026           0.26
## CAR_TYPESUV              0.0022           0.22
## CAR_TYPEVan              0.0228           2.30
## CAR_USEPrivate           0.0128           1.29
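
The Percent_Change column in all of these tables is consistent with the standard log-linear reading: with log(TARGET_AMT) as the response, a coefficient b corresponds to a 100*(exp(b) - 1) percent change in the predicted claim amount per one-unit increase in the predictor. interpret_lm is defined earlier in the document; the sketch below is only an assumption about what it computes, and reproduces the tables above.

# Minimal sketch of the percent-change interpretation (assumption: the
# interpret_lm() defined earlier applies exactly this transformation)
interpret_lm_sketch <- function(model) {
  b <- coef(model)
  data.frame(
    Coefficient    = round(b, 4),
    # 100 * (exp(b) - 1): percent change in TARGET_AMT per one-unit
    # increase in the predictor, since the response is on the log scale
    Percent_Change = round((exp(b) - 1) * 100, 2)
  )
}
# e.g. interpret_lm_sketch(lm_model5) should reproduce the LM5 table above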

5 Select Models

Logistic Regression Models:

Recommended: Model 2 (Stepwise Selection on Model 1)

Key Strengths:

- Highest accuracy (78.98%, tied with Model 1)
- Highest precision and specificity
- Lowest AIC (5894.410): a good balance between model fit and complexity
- Second-highest AUC (0.8066): excellent discriminatory power
- No unusual data points
- Uses only the important predictors, matching the full model's accuracy with fewer variables

Trade-offs:

- Lower sensitivity: the model is somewhat biased toward the majority class (not getting into a car crash)
- Most variables are statistically significant, but not all (e.g., EDUCATIONMasters and EDUCATIONPhD)
- Only about 64% of the binned residuals fall inside the error bounds; that is the best of this batch of models, but the points do not look random, which means the model could still be improved

Reasons for not choosing the other models:

Model 1 (All Predictors):

- Same accuracy as Model 2 (78.98%)
- Second-lowest AIC (5902.53), but includes potentially unnecessary variables
- The linearity-of-the-log-odds check did not perform well, with many predictors showing non-linearity (to be expected)
- Only about 60% of the binned residuals fall inside the error bounds
- No unusual data points exist
- May retain non-significant predictors and is less interpretable with all variables included

Model 1B (All Predictors, SMOTE + Standardized):

- Lower accuracy (70%), which makes sense since SMOTE can hurt the raw accuracy metric
- Very high AIC (fit on the larger SMOTE-resampled training set, so not directly comparable) and a lower AUC
- The linearity-of-the-log-odds check performed okay, with some predictors showing non-linearity (to be expected)
- Only about 62% of the binned residuals fall inside the error bounds
- No unusual data points exist
- May retain non-significant predictors and is less interpretable with all variables included

Model 2B (Stepwise Selection on Model 1B):

- Lower accuracy (70%), which makes sense since SMOTE can hurt the raw accuracy metric
- Very high AIC (again fit on the SMOTE-resampled training set) and a lower AUC
- Highest sensitivity and F1 score
- The linearity-of-the-log-odds check performed okay, with some predictors showing non-linearity (to be expected)
- Only about 61% of the binned residuals fall inside the error bounds
- No unusual data points exist

Model 3 (Transformed Variables):

- Lowest accuracy and high AIC despite the transformations; poor performance metrics across the board
- The transformations did not improve the model
- The linearity-of-the-log-odds check performed okay, with a few predictors showing non-linearity
- Only about 58% of the binned residuals fall inside the error bounds
- No unusual data points exist

Model 4 (SMOTE + New Features):

- Lower accuracy (70%), which makes sense since SMOTE can hurt the raw accuracy metric
- High AIC
- High sensitivity and F1 score
- The new features did not improve the model
- The linearity-of-the-log-odds check did not perform well, with many predictors showing non-linearity
- Only about 60% of the binned residuals fall inside the error bounds
- No unusual data points exist

Model 4B (SMOTE, Transformed + New Features):

- Lower accuracy (70%), which makes sense since SMOTE can hurt the raw accuracy metric
- High AIC
- High sensitivity and F1 score
- The new features did not improve the model
- The linearity-of-the-log-odds check did not perform well, with many predictors showing non-linearity
- Only about 57% of the binned residuals fall inside the error bounds
- No unusual data points exist

Final Recommendation: Select Model 2 for its optimal balance of:

- Statistical rigor (lowest AIC)
- Predictive performance (high AUC and the highest accuracy)
- Model simplicity and interpretability
- Practical usability

Some takeaways from Model 2's coefficients:

Some of the coefficients match our initial insights, while others do not. The model does not fully capture all of the relationships among the predictors, so there is room for improvement.
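
To make the comparison above easier to scan, the reported test-set metrics can be collected into one table. This is just a sketch that transcribes the values printed in the per-model evaluation output below (Models 3, 4, and 4B are omitted here).

# Side-by-side summary of the reported test-set metrics (transcribed from
# the evaluation output below)
model_summary <- data.frame(
  Model    = c("1 (All)", "1B (SMOTE+Std)", "2 (Stepwise)", "2B (Stepwise SMOTE)"),
  Accuracy = c(0.7898, 0.7022, 0.7898, 0.7028),
  Sens     = c(0.4209, 0.7674, 0.4140, 0.7698),
  Spec     = c(0.9218, 0.6789, 0.9243, 0.6789),
  F1       = c(0.5135, 0.5759, 0.5093, 0.5772),
  AUC      = c(0.8068, 0.8012, 0.8066, 0.8012),
  # Note: the 1B/2B AICs are computed on the SMOTE-resampled training set,
  # so they are not directly comparable to the Model 1/2 AICs
  AIC      = c(5902.53, 9928.52, 5894.41, 9921.38)
)
model_summary[order(model_summary$AIC), ]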

# Model 1 Evaluation
pred1_prob <- predict(logit_model1, test, type = "response")
pred1_class <- ifelse(pred1_prob > 0.5, 1, 0)

cm1 <- table(Predicted = pred1_class, Actual = test$TARGET_FLAG)
acc1 <- sum(diag(cm1)) / sum(cm1)
prec1 <- cm1[2,2] / sum(cm1[2,])
sens1 <- cm1[2,2] / sum(cm1[,2])
spec1 <- cm1[1,1] / sum(cm1[,1])
f1_1 <- 2 * (prec1 * sens1) / (prec1 + sens1)
roc1 <- roc(test$TARGET_FLAG, pred1_prob, quiet = TRUE)
auc1 <- auc(roc1)

output1 <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 1 Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm1)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc1, 4), "| Precision:", round(prec1, 4), 
                 "| Sensitivity:", round(sens1, 4), "| Specificity:", round(spec1, 4), "\n",
                 "F1 Score:", round(f1_1, 4), "| AUC:", round(auc1, 4), 
                 "| AIC:", round(AIC(logit_model1), 4), "\n\n", sep = " ")
cat(output1)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 1 Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted    0    1
##         0 1108  249
##         1   94  181 
##  Accuracy: 0.7898 | Precision: 0.6582 | Sensitivity: 0.4209 | Specificity: 0.9218 
##  F1 Score: 0.5135 | AUC: 0.8068 | AIC: 5902.5296
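
The low sensitivity at the default 0.5 cutoff is an operating-point choice, not a fixed property of the model. One way to explore alternatives (a sketch, not part of the original analysis) is to pull the Youden-optimal threshold from the ROC object already computed above:

# Sketch: find the cutoff that maximizes Youden's J
# (sensitivity + specificity - 1) on the ROC curve from above
best <- coords(roc1, "best", best.method = "youden",
               ret = c("threshold", "sensitivity", "specificity"))
best
# Reclassify at the new cutoff (taking the first threshold if several tie)
pred1_class_adj <- ifelse(pred1_prob > best$threshold[1], 1, 0)
table(Predicted = pred1_class_adj, Actual = test$TARGET_FLAG)
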
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test)
test_copy <- test
test_copy <- test_copy %>%
  mutate(logit = log(pred1_prob/(1-pred1_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
deviance_residuals <- sign(test$TARGET_FLAG - pred1_prob) * 
 sqrt(-2 * (test$TARGET_FLAG * log(pred1_prob) + 
 (1 - test$TARGET_FLAG) * log(1 - pred1_prob)))
plot(pred1_prob, deviance_residuals,
  xlab = "Fitted Probabilities",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs. Fitted Probabilities")
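
For reference, the quantity computed above is the standard deviance residual for a Bernoulli response,

$$ d_i = \operatorname{sign}(y_i - \hat p_i)\,\sqrt{-2\left[\,y_i \log \hat p_i + (1 - y_i)\log(1 - \hat p_i)\,\right]}, $$

which, as a rough rule, should mostly lie within about plus or minus 2 when the model fits well.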

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model1))
# n = # of observations
n <- nrow(train)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.01164037
plot(logit_model1, which = 4, id.n = 5)

qqnorm(residuals(logit_model1))

halfnorm(hatvalues(logit_model1))

# Binned residuals
binned_result <- binned_residuals(logit_model1)
binned_result
## Warning: Probably bad model fit. Only about 60% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model1), show_dots = TRUE)
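
The Cook's distance and half-normal leverage plots above flag points graphically; to list the specific observations instead, a small sketch using the base-R accessors on the fitted glm:

# Sketch: training observations above the leverage cutoff computed above,
# and the largest Cook's distances
lev <- hatvalues(logit_model1)
head(sort(lev[lev > high_leverage], decreasing = TRUE))
head(sort(cooks.distance(logit_model1), decreasing = TRUE), 5)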

# Model 1B Evaluation
# used train_standardized_smote

pred1b_prob <- predict(logit_model1b, test_standardized, type = "response")
pred1b_class <- ifelse(pred1b_prob > 0.5, 1, 0)

cm1b <- table(Predicted = pred1b_class, Actual = test_standardized$TARGET_FLAG)
acc1b <- sum(diag(cm1b)) / sum(cm1b)
prec1b <- cm1b[2,2] / sum(cm1b[2,])
sens1b <- cm1b[2,2] / sum(cm1b[,2])
spec1b <- cm1b[1,1] / sum(cm1b[,1])
f1_1b <- 2 * (prec1b * sens1b) / (prec1b + sens1b)
roc1b <- roc(test_standardized$TARGET_FLAG, pred1b_prob, quiet = TRUE)
auc1b <- auc(roc1b)

output1b <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 1B Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm1b)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc1b, 4), "| Precision:", round(prec1b, 4), 
                 "| Sensitivity:", round(sens1b, 4), "| Specificity:", round(spec1b, 4), "\n",
                 "F1 Score:", round(f1_1b, 4), "| AUC:", round(auc1b, 4), 
                 "| AIC:", round(AIC(logit_model1b), 4), "\n\n", sep = " ")
cat(output1b)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 1B Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted   0   1
##         0 816 100
##         1 386 330 
##  Accuracy: 0.7022 | Precision: 0.4609 | Sensitivity: 0.7674 | Specificity: 0.6789 
##  F1 Score: 0.5759 | AUC: 0.8012 | AIC: 9928.5196
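
Because SMOTE deliberately shifts the operating point toward the minority class, raw accuracy understates Model 1B; a class-balanced summary is fairer. A small sketch using the metrics already computed above:

# Sketch: balanced accuracy = mean of sensitivity and specificity,
# a fairer summary than raw accuracy after SMOTE rebalancing
bal_acc1  <- (sens1  + spec1)  / 2   # Model 1 (no SMOTE)
bal_acc1b <- (sens1b + spec1b) / 2   # Model 1B (SMOTE)
round(c(Model1 = bal_acc1, Model1B = bal_acc1b), 4)
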
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test_standardized)
test_standardized_copy <- test_standardized
test_standardized_copy <- test_standardized_copy %>%
  mutate(logit = log(pred1b_prob/(1-pred1b_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_standardized_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
# test_standardized_copy$TARGET_FLAG <- as.numeric(as.character(test_standardized$TARGET_FLAG))
# deviance_residuals <- sign(test_standardized_copy$TARGET_FLAG - pred1b_prob) * 
#  sqrt(-2 * (test_standardized_copy$TARGET_FLAG * log(pred1b_prob) + 
#  (1 - test_standardized_copy$TARGET_FLAG) * log(1 - pred1b_prob)))
# plot(pred1b_prob, deviance_residuals,
#   xlab = "Fitted Probabilities",
#   ylab = "Deviance Residuals",
#   main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model1b))
# n = # of observations
n <- nrow(train_standardized_smote)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.007906783
plot(logit_model1b, which = 4, id.n = 5)

qqnorm(residuals(logit_model1b))

halfnorm(hatvalues(logit_model1b))

# Binned residuals
binned_result <- binned_residuals(logit_model1b)
binned_result
## Warning: Probably bad model fit. Only about 62% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model1b), show_dots = TRUE)

# Model 2 Evaluation
pred2_prob <- predict(logit_model2, test, type = "response")
pred2_class <- ifelse(pred2_prob > 0.5, 1, 0)

cm2 <- table(Predicted = pred2_class, Actual = test$TARGET_FLAG)
acc2 <- sum(diag(cm2)) / sum(cm2)
prec2 <- cm2[2,2] / sum(cm2[2,])
sens2 <- cm2[2,2] / sum(cm2[,2])
spec2 <- cm2[1,1] / sum(cm2[,1])
f1_2 <- 2 * (prec2 * sens2) / (prec2 + sens2)
roc2 <- roc(test$TARGET_FLAG, pred2_prob, quiet = TRUE)
auc2 <- auc(roc2)

output2 <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 2 Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm2)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc2, 4), "| Precision:", round(prec2, 4), 
                 "| Sensitivity:", round(sens2, 4), "| Specificity:", round(spec2, 4), "\n",
                 "F1 Score:", round(f1_2, 4), "| AUC:", round(auc2, 4), 
                 "| AIC:", round(AIC(logit_model2), 4), "\n\n", sep = " ")
cat(output2)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 2 Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted    0    1
##         0 1111  252
##         1   91  178 
##  Accuracy: 0.7898 | Precision: 0.6617 | Sensitivity: 0.414 | Specificity: 0.9243 
##  F1 Score: 0.5093 | AUC: 0.8066 | AIC: 5894.4099
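
Since Model 2 was obtained by stepwise selection from Model 1, the two models are nested, so the terms dropped by the search can also be tested jointly with a likelihood-ratio test (a sketch; the original analysis relies on AIC):

# Sketch: likelihood-ratio test of the terms dropped by stepwise selection
# (Model 2 is nested in Model 1)
anova(logit_model2, logit_model1, test = "Chisq")
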
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test)
test_copy <- test
test_copy <- test_copy %>%
  mutate(logit = log(pred2_prob/(1-pred2_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
deviance_residuals <- sign(test$TARGET_FLAG - pred2_prob) * 
 sqrt(-2 * (test$TARGET_FLAG * log(pred2_prob) + 
 (1 - test$TARGET_FLAG) * log(1 - pred2_prob)))
plot(pred2_prob, deviance_residuals,
  xlab = "Fitted Probabilities",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model2))
# n = # of observations
n <- nrow(train)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.01010875
plot(logit_model2, which = 4, id.n = 5)

qqnorm(residuals(logit_model2))

halfnorm(hatvalues(logit_model2))

# Binned residuals
binned_result <- binned_residuals(logit_model2)
binned_result
## Warning: Probably bad model fit. Only about 64% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model2), show_dots = TRUE)
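
The binned-residual warnings above point to calibration problems; one complementary check is the Hosmer-Lemeshow test. A sketch, assuming the ResourceSelection package is available:

# Sketch: Hosmer-Lemeshow goodness-of-fit on the test-set predictions
# (assumes the ResourceSelection package is installed)
library(ResourceSelection)
hoslem.test(test$TARGET_FLAG, pred2_prob, g = 10)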

# Model 2B Evaluation
pred2b_prob <- predict(logit_model2b, test_standardized, type = "response")
pred2b_class <- ifelse(pred2b_prob > 0.5, 1, 0)

cm2b <- table(Predicted = pred2b_class, Actual = test_standardized$TARGET_FLAG)
acc2b <- sum(diag(cm2b)) / sum(cm2b)
prec2b <- cm2b[2,2] / sum(cm2b[2,])
sens2b <- cm2b[2,2] / sum(cm2b[,2])
spec2b <- cm2b[1,1] / sum(cm2b[,1])
f1_2b <- 2 * (prec2b * sens2b) / (prec2b + sens2b)
roc2b <- roc(test_standardized$TARGET_FLAG, pred2b_prob, quiet = TRUE)
auc2b <- auc(roc2b)

output2b <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 2B Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm2b)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc2b, 4), "| Precision:", round(prec2b, 4), 
                 "| Sensitivity:", round(sens2b, 4), "| Specificity:", round(spec2b, 4), "\n",
                 "F1 Score:", round(f1_2b, 4), "| AUC:", round(auc2b, 4), 
                 "| AIC:", round(AIC(logit_model2b), 4), "\n\n", sep = " ")
cat(output2b)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 2B Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted   0   1
##         0 816  99
##         1 386 331 
##  Accuracy: 0.7028 | Precision: 0.4616 | Sensitivity: 0.7698 | Specificity: 0.6789 
##  F1 Score: 0.5772 | AUC: 0.8012 | AIC: 9921.3848
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test_standardized)
test_standardized_copy <- test_standardized
test_standardized_copy <- test_standardized_copy %>%
  mutate(logit = log(pred2b_prob/(1-pred2b_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_standardized_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
# test_standardized_copy$TARGET_FLAG <- as.numeric(as.character(test_standardized$TARGET_FLAG))
# deviance_residuals <- sign(test_standardized$TARGET_FLAG - pred2b_prob) * 
#  sqrt(-2 * (test_standardized$TARGET_FLAG * log(pred2b_prob) + 
#  (1 - test_standardized$TARGET_FLAG) * log(1 - pred2b_prob)))
# plot(pred2b_prob, deviance_residuals,
#   xlab = "Fitted Probabilities",
#   ylab = "Deviance Residuals",
#   main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model2b))
# n = # of observations
n <- nrow(train_standardized_smote)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.00707449
plot(logit_model2b, which = 4, id.n = 5)

qqnorm(residuals(logit_model2b))

halfnorm(hatvalues(logit_model2b))

# Binned residuals
binned_result <- binned_residuals(logit_model2b)
binned_result
## Warning: Probably bad model fit. Only about 61% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model2b), show_dots = TRUE)

# Model 3 Evaluation
# used train_transformed_smote

pred3_prob <- predict(logit_model3, test_transformed, type = "response")
pred3_class <- ifelse(pred3_prob > 0.5, 1, 0)

cm3 <- table(Predicted = pred3_class, Actual = test_transformed$TARGET_FLAG)
acc3 <- sum(diag(cm3)) / sum(cm3)
prec3 <- cm3[2,2] / sum(cm3[2,])
sens3 <- cm3[2,2] / sum(cm3[,2])
spec3 <- cm3[1,1] / sum(cm3[,1])
f1_3 <- 2 * (prec3 * sens3) / (prec3 + sens3)
roc3 <- roc(test_transformed$TARGET_FLAG, pred3_prob, quiet = TRUE)
auc3 <- auc(roc3)

output3 <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 3 Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm3)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc3, 4), "| Precision:", round(prec3, 4), 
                 "| Sensitivity:", round(sens3, 4), "| Specificity:", round(spec3, 4), "\n",
                 "F1 Score:", round(f1_3, 4), "| AUC:", round(auc3, 4), 
                 "| AIC:", round(AIC(logit_model3), 4), "\n\n", sep = " ")
cat(output3)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 3 Evaluation ===
##  Confusion Matrix:
## (degenerate output omitted: the original chunk tabulated the raw
## probabilities, pred3_prob, against the actuals, so this "confusion
## matrix" had one row per fitted probability; with the corrected
## pred3_class above, cm3 is an ordinary 2x2 table)
##   0.832318554274813   0 1
##   0.832343747659832   0 1
##   0.832575198129651   1 0
##   0.832915437772652   1 0
##   0.833416925049428   1 0
##   0.833462170180602   1 0
##   0.8349319360766     1 0
##   0.835509631712571   0 1
##   0.836247083385393   1 0
##   0.836534737416809   0 1
##   0.838378628693095   1 0
##   0.839328733449032   0 1
##   0.839333226967658   1 0
##   0.839853801366865   1 0
##   0.842091593632251   0 1
##   0.843156016654035   0 1
##   0.843517706293874   1 0
##   0.844126350789613   1 0
##   0.844927712796785   0 1
##   0.845218953878019   1 0
##   0.845953282098144   1 0
##   0.846415525107876   1 0
##   0.846635629288822   0 1
##   0.846874018363778   1 0
##   0.846898930060729   0 1
##   0.848946594187633   1 0
##   0.849132255622299   1 0
##   0.849607023688078   0 1
##   0.851948863502721   1 0
##   0.852077498920868   0 1
##   0.852417009022745   1 0
##   0.853159369484578   0 1
##   0.853810896144594   0 1
##   0.854010443072511   1 0
##   0.855815233465928   0 1
##   0.857117888345884   0 1
##   0.857655125273416   0 1
##   0.859280272713291   1 0
##   0.860288265630249   0 1
##   0.860557809413042   0 1
##   0.862187820029904   0 1
##   0.862622257203269   0 1
##   0.86309262528379    0 1
##   0.863739612735006   0 1
##   0.863892317276536   0 1
##   0.864593821192564   0 1
##   0.865073608271568   1 0
##   0.865166348133159   1 0
##   0.866467683719451   0 1
##   0.867602713228957   0 1
##   0.868373182748143   0 1
##   0.869106429082859   1 0
##   0.870314285471362   0 1
##   0.870746399356224   1 0
##   0.871683481169687   0 1
##   0.871910659307404   0 1
##   0.872086042352394   0 1
##   0.87360748688278    0 1
##   0.875374983603975   0 1
##   0.876419806179625   0 1
##   0.877412203930111   0 1
##   0.877996657400666   1 0
##   0.878093197233642   0 1
##   0.878150883295227   1 0
##   0.878352256077034   1 0
##   0.878582433283531   0 1
##   0.879472761843928   0 1
##   0.880563190853761   0 1
##   0.881115135678844   0 1
##   0.88141534140439    1 0
##   0.882053440402456   0 1
##   0.88664051546398    1 0
##   0.886903010129604   1 0
##   0.888546243843489   0 1
##   0.888888881922608   0 1
##   0.8900059697315     0 1
##   0.89013471919984    0 1
##   0.890326187803868   0 1
##   0.891038790442046   0 1
##   0.891087796051991   0 1
##   0.892118148488132   0 1
##   0.894423201056223   0 1
##   0.895172744386818   0 1
##   0.896777335454486   0 1
##   0.897254349361216   0 1
##   0.900500661856682   1 0
##   0.90157324646983    1 0
##   0.902411714878915   0 1
##   0.903229355732225   0 1
##   0.905639281672779   0 1
##   0.905933746528419   0 1
##   0.906860307447206   0 1
##   0.907130167089109   0 1
##   0.907550270371004   1 0
##   0.907702369863836   0 1
##   0.909879618370683   0 1
##   0.90999931334596    1 0
##   0.91030790038775    0 1
##   0.910846223360086   1 0
##   0.910924562982407   0 1
##   0.91460262687486    0 1
##   0.91541851446785    1 0
##   0.915843719576298   0 1
##   0.915908342586184   0 1
##   0.916705203477213   0 1
##   0.918710577057408   0 1
##   0.919300653857714   0 1
##   0.920216729963549   0 1
##   0.92176591573461    0 1
##   0.922822371951601   0 1
##   0.924354869726597   0 1
##   0.924784342872966   0 1
##   0.925346958626922   1 0
##   0.926063417504803   1 0
##   0.926333589638811   1 0
##   0.928149052096042   1 0
##   0.928364210084208   0 1
##   0.928636006222907   0 1
##   0.929094463665043   0 1
##   0.93048807596869    0 1
##   0.930766336439034   0 1
##   0.931237602550781   0 1
##   0.932605572887754   1 0
##   0.935049820546746   0 1
##   0.935650454686583   0 1
##   0.938005614961878   1 0
##   0.938669791017977   1 0
##   0.939009553268253   0 1
##   0.939018940832528   1 0
##   0.939156132018518   0 1
##   0.939542756576638   1 0
##   0.940698302296738   0 1
##   0.940761873837284   0 1
##   0.941472012670048   1 0
##   0.942632382423992   1 0
##   0.943279307173978   0 1
##   0.944076231775509   0 1
##   0.944708704207129   0 1
##   0.944716510825538   0 1
##   0.945165491536202   0 1
##   0.94703273977696    1 0
##   0.947489292968382   0 1
##   0.947859943220322   0 1
##   0.948092614312854   0 1
##   0.948287274285722   0 1
##   0.949847801840868   0 1
##   0.950452458359048   0 1
##   0.952361632042752   0 1
##   0.952384647933704   0 1
##   0.953209270713033   0 1
##   0.955342476893636   0 1
##   0.958847387300944   0 1
##   0.959760131987821   0 1
##   0.962458098681201   0 1
##   0.964953125418087   0 1
##   0.965697157500858   0 1
##   0.967044535526669   0 1
##   0.968078671817562   0 1
##   0.968447583368736   0 1
##   0.968828961452941   0 1
##   0.970572204397303   0 1
##   0.971413424095712   1 0
##   0.972078593558456   0 1
##   0.972642692356072   0 1
##   0.975263630040745   0 1
##   0.975851333980377   1 0
##   0.985541597092168   0 1 
##  Accuracy: 6e-04 | Precision: 0 | Sensitivity: 0 | Specificity: 8e-04 
##  F1 Score: NaN | AUC: 0.7927 | AIC: 9791.476
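
The degenerate metrics above (accuracy near zero, NaN F1) are the signature of building the confusion matrix from raw probabilities rather than thresholded class labels: every distinct probability becomes its own "Predicted" row, so almost nothing lands on the diagonal. A minimal sketch of the likely fix, assuming pred3_prob holds Model 3's held-out predicted probabilities (the *_fixed names below are hypothetical):

# Threshold the probabilities into 0/1 class labels before tabulating,
# as was done for the other models
pred3_class_fixed <- ifelse(pred3_prob > 0.5, 1, 0)
cm3_fixed <- table(Predicted = pred3_class_fixed,
                   Actual = test_transformed$TARGET_FLAG)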
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test)  # kept from the STHDA template; unused below, since facet_wrap() uses the gathered "predictors" column
test_copy <- test
test_copy <- test_copy %>%
  mutate(logit = log(pred3_prob/(1-pred3_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
deviance_residuals <- sign(test_transformed$TARGET_FLAG - pred3_prob) * 
 sqrt(-2 * (test_transformed$TARGET_FLAG * log(pred3_prob) + 
 (1 - test_transformed$TARGET_FLAG) * log(1 - pred3_prob)))
plot(pred3_prob, deviance_residuals,
  xlab = "Fitted Probabilities",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model3))
# n = # of observations
n <- nrow(train_transformed_smote)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.007906783
plot(logit_model3, which = 4, id.n = 5)

qqnorm(residuals(logit_model3))

halfnorm(hatvalues(logit_model3))

# Binned residuals
binned_result <- binned_residuals(logit_model3)
binned_result
## Warning: Probably bad model fit. Only about 58% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model3), show_dots = TRUE)
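
binned_residuals() (presumably from the performance package, which also supplies the plot method used here) sorts observations into bins by fitted probability and checks whether each bin's mean residual stays within roughly ±2 standard errors; about 95% of bins should fall inside the bounds for a well-specified model, so 58% is a clear warning. To see which probability ranges misfit, the bin table can be inspected directly:

# Inspect the bin-level table: fitted-probability range, mean residual,
# and confidence bounds for each bin
as.data.frame(binned_result)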

# Model 4 Evaluation
# used train_transformed_smote

pred4_prob <- predict(logit_model4, test_transformed, type = "response")
pred4_class <- ifelse(pred4_prob > 0.5, 1, 0)

cm4 <- table(Predicted = pred4_class, Actual = test_transformed$TARGET_FLAG)
acc4 <- sum(diag(cm4)) / sum(cm4)
prec4 <- cm4[2,2] / sum(cm4[2,])
sens4 <- cm4[2,2] / sum(cm4[,2])
spec4 <- cm4[1,1] / sum(cm4[,1])
f1_4 <- 2 * (prec4 * sens4) / (prec4 + sens4)
roc4 <- roc(test_transformed$TARGET_FLAG, pred4_prob, quiet = TRUE)
auc4 <- auc(roc4)

output4 <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 4 Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm4)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc4, 4), "| Precision:", round(prec4, 4), 
                 "| Sensitivity:", round(sens4, 4), "| Specificity:", round(spec4, 4), "\n",
                 "F1 Score:", round(f1_4, 4), "| AUC:", round(auc4, 4), 
                 "| AIC:", round(AIC(logit_model4), 4), "\n\n", sep = " ")
cat(output4)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 4 Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted   0   1
##         0 831 114
##         1 371 316 
##  Accuracy: 0.7028 | Precision: 0.46 | Sensitivity: 0.7349 | Specificity: 0.6913 
##  F1 Score: 0.5658 | AUC: 0.7882 | AIC: 9961.226
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test_transformed)
test_transformed_copy <- test_transformed
test_transformed_copy <- test_transformed_copy %>%
  mutate(logit = log(pred4_prob/(1-pred4_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_transformed_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
deviance_residuals <- sign(test_transformed$TARGET_FLAG - pred4_prob) * 
 sqrt(-2 * (test_transformed$TARGET_FLAG * log(pred4_prob) + 
 (1 - test_transformed$TARGET_FLAG) * log(1 - pred4_prob)))
plot(pred4_prob, deviance_residuals,
  xlab = "Fitted Probabilities",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model4))
# n = # of observations
n <- nrow(train_standardized_smote)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.00769871
plot(logit_model4, which = 4, id.n = 5)

qqnorm(residuals(logit_model4))

halfnorm(hatvalues(logit_model4))

# Binned residuals
binned_result <- binned_residuals(logit_model4)
binned_result
## Warning: Probably bad model fit. Only about 60% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model4), show_dots = TRUE)

# Model 4B Evaluation
# used train_transformed_smote

pred4b_prob <- predict(logit_model4b, test_transformed, type = "response")
pred4b_class <- ifelse(pred4b_prob > 0.5, 1, 0)

cm4b <- table(Predicted = pred4b_class, Actual = test_transformed$TARGET_FLAG)
acc4b <- sum(diag(cm4b)) / sum(cm4b)
prec4b <- cm4b[2,2] / sum(cm4b[2,])
sens4b <- cm4b[2,2] / sum(cm4b[,2])
spec4b <- cm4b[1,1] / sum(cm4b[,1])
f1_4b <- 2 * (prec4b * sens4b) / (prec4b + sens4b)
roc4b <- roc(test_transformed$TARGET_FLAG, pred4b_prob, quiet = TRUE)
auc4b <- auc(roc4b)

output4b <- paste("\n=== Model Selection and Evaluation ===\n\n",
                 "=== Model 4B Evaluation ===\n",
                 "Confusion Matrix:\n",
                 paste(capture.output(print(cm4b)), collapse = "\n"), "\n",
                 "Accuracy:", round(acc4b, 4), "| Precision:", round(prec4b, 4), 
                 "| Sensitivity:", round(sens4b, 4), "| Specificity:", round(spec4b, 4), "\n",
                 "F1 Score:", round(f1_4b, 4), "| AUC:", round(auc4b, 4), 
                 "| AIC:", round(AIC(logit_model4b), 4), "\n\n", sep = " ")
cat(output4b)
## 
## === Model Selection and Evaluation ===
## 
##  === Model 4B Evaluation ===
##  Confusion Matrix:
##           Actual
## Predicted   0   1
##         0 845 117
##         1 357 313 
##  Accuracy: 0.7096 | Precision: 0.4672 | Sensitivity: 0.7279 | Specificity: 0.703 
##  F1 Score: 0.5691 | AUC: 0.7892 | AIC: 9898.7207
# Diagnostics
# Logistic Regression Assumption Check
# This chunk of code is from Statistical tools for high-throughput data analysis (STHDA)
predictors <- colnames(test_transformed)
test_transformed_copy <- test_transformed
test_transformed_copy <- test_transformed_copy %>%
  mutate(logit = log(pred4b_prob/(1-pred4b_prob))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

ggplot(test_transformed_copy, aes(logit, predictor.value)) +
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") +
  theme_bw() +
  facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula = 'y ~ x'

# Deviance Residuals Plot
deviance_residuals <- sign(test_transformed$TARGET_FLAG - pred4b_prob) * 
 sqrt(-2 * (test_transformed$TARGET_FLAG * log(pred4b_prob) + 
 (1 - test_transformed$TARGET_FLAG) * log(1 - pred4b_prob)))
plot(pred4b_prob, deviance_residuals,
  xlab = "Fitted Probabilities",
  ylab = "Deviance Residuals",
  main = "Deviance Residuals vs. Fitted Probabilities")

# Unusual points check
# Calculate the leverage cutoff
# p = # of coeff
p <- length(coef(logit_model4b))
# n = # of observations
n <- nrow(train_standardized_smote)
# Cut off point high leverage
high_leverage <- (2 * (p+1)) / n
print(high_leverage)
## [1] 0.00769871
plot(logit_model4b, which = 4, id.n = 5)

qqnorm(residuals(logit_model4b))

halfnorm(hatvalues(logit_model4b))

# Binned residuals
binned_result <- binned_residuals(logit_model4b)
binned_result
## Warning: Probably bad model fit. Only about 57% of the residuals are inside the error bounds.
plot(binned_residuals(logit_model4b), show_dots = TRUE)

# Create comparison table
comparison <- data.frame(
  Model = c("Model1", "Model1B", "Model2", "Model2B", "Model3", "Model4", "Model4B"),
  AIC = c(AIC(logit_model1), AIC(logit_model1b), AIC(logit_model2), AIC(logit_model2b), AIC(logit_model3), AIC(logit_model4), AIC(logit_model4b)),
  Accuracy = c(acc1, acc1b, acc2, acc2b, acc3, acc4, acc4b),
  Error_Rate = c(1-acc1, 1-acc1b, 1-acc2, 1-acc2b, 1-acc3, 1-acc4, 1-acc4b),
  Precision = c(prec1, prec1b, prec2, prec2b, prec3, prec4, prec4b),
  Sensitivity = c(sens1, sens1b, sens2, sens2b, sens3, sens4, sens4b),
  Specificity = c(spec1, spec1b, spec2, spec2b, spec3, spec4, spec4b),
  F1_Score = c(f1_1, f1_1b, f1_2, f1_2b, f1_3, f1_4, f1_4b),
  AUC = c(auc1, auc1b, auc2, auc2b, auc3, auc4, auc4b)
)

cat("\n=== Model Comparison ===\n")
## 
## === Model Comparison ===
print(comparison)
##     Model      AIC     Accuracy Error_Rate Precision Sensitivity  Specificity
## 1  Model1 5902.530 0.7898284314  0.2101716 0.6581818   0.4209302 0.9217970050
## 2 Model1B 9928.520 0.7022058824  0.2977941 0.4608939   0.7674419 0.6788685524
## 3  Model2 5894.410 0.7898284314  0.2101716 0.6617100   0.4139535 0.9242928453
## 4 Model2B 9921.385 0.7028186275  0.2971814 0.4616457   0.7697674 0.6788685524
## 5  Model3 9791.476 0.0006127451  0.9993873 0.0000000   0.0000000 0.0008319468
## 6  Model4 9961.226 0.7028186275  0.2971814 0.4599709   0.7348837 0.6913477537
## 7 Model4B 9898.721 0.7095588235  0.2904412 0.4671642   0.7279070 0.7029950083
##    F1_Score       AUC
## 1 0.5134752 0.8068432
## 2 0.5759162 0.8011821
## 3 0.5092990 0.8065782
## 4 0.5771578 0.8011609
## 5       NaN 0.7927311
## 6 0.5658013 0.7881515
## 7 0.5690909 0.7891576
# Plot ROC curves
plot(roc1, col = "blue", main = "ROC Curves Comparison")
plot(roc1b, col = "red", add = TRUE)
plot(roc2, col = "green", add = TRUE)
plot(roc2b, col = "purple", add = TRUE)
plot(roc3, col = "yellow", add = TRUE)
plot(roc4, col = "brown", add = TRUE)
plot(roc4b, col = "orange", add = TRUE)
legend("bottomright", 
       legend = paste0(c("Model1", "Model1B", "Model2", "Model2B", "Model3", "Model4", "Model4B"), " (AUC=", 
                      round(c(auc1, auc1b, auc2, auc2b, auc3, auc4, auc4b), 3), ")"),
       col = c("blue", "red", "green", "purple", "yellow", "brown", "orange"),
       lwd = 2)

# Select best model (Model 2)
output_text <- paste(
"\n=== SELECTED LOGISTIC REGRESSION MODEL: Model 2 ===\n",
"Reasons for selection:\n",
"1. Lowest AIC of all seven models, indicating the best balance of fit and complexity\n",
"2. Accuracy tied for the highest, and AUC only marginally below Model 1's, so discriminatory power is preserved\n",
"3. Retains only the important predictors\n"
)

cat(output_text)
## 
## === SELECTED LOGISTIC REGRESSION MODEL: Model 2 ===
##  Reasons for selection:
##  1. Lowest AIC of all seven models, indicating the best balance of fit and complexity
##  2. Accuracy tied for the highest, and AUC only marginally below Model 1's, so discriminatory power is preserved
##  3. Retains only the important predictors
final_logit_model <- logit_model2
summary(final_logit_model)
## 
## Call:
## glm(formula = TARGET_FLAG ~ BLUEBOOK + CAR_TYPE + CAR_USE + CLM_FREQ + 
##     EDUCATION + HOME_VAL + INCOME + JOB + KIDSDRIV + MSTATUS + 
##     MVR_PTS + OLDCLAIM + PARENT1 + REVOKED + TIF + TRAVTIME + 
##     URBANICITY + YOJ, family = binomial, data = train)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.364e+00  3.079e-01 -10.923  < 2e-16 ***
## BLUEBOOK                      -2.252e-05  5.313e-06  -4.237 2.26e-05 ***
## CAR_TYPEPanel Truck            5.825e-01  1.692e-01   3.444 0.000573 ***
## CAR_TYPEPickup                 5.475e-01  1.117e-01   4.902 9.48e-07 ***
## CAR_TYPESports Car             1.022e+00  1.195e-01   8.554  < 2e-16 ***
## CAR_TYPESUV                    7.503e-01  9.578e-02   7.834 4.72e-15 ***
## CAR_TYPEVan                    7.318e-01  1.343e-01   5.449 5.06e-08 ***
## CAR_USEPrivate                -7.668e-01  9.741e-02  -7.872 3.49e-15 ***
## CLM_FREQ                       1.721e-01  3.194e-02   5.388 7.14e-08 ***
## EDUCATIONHigh School           4.149e-01  8.992e-02   4.615 3.94e-06 ***
## EDUCATIONMasters               3.847e-02  1.506e-01   0.255 0.798368    
## EDUCATIONPhD                   1.653e-01  1.927e-01   0.858 0.390795    
## HOME_VAL                      -1.407e-06  3.775e-07  -3.726 0.000194 ***
## INCOME                        -2.377e-06  1.189e-06  -1.999 0.045577 *  
## JOBBlue Collar                 3.778e-01  2.075e-01   1.820 0.068690 .  
## JOBClerical                    4.684e-01  2.199e-01   2.130 0.033146 *  
## JOBDoctor                     -2.869e-01  2.890e-01  -0.993 0.320799    
## JOBHome Maker                  2.911e-01  2.339e-01   1.245 0.213302    
## JOBLawyer                      1.576e-01  1.905e-01   0.827 0.408040    
## JOBManager                    -4.881e-01  1.922e-01  -2.540 0.011092 *  
## JOBProfessional                2.445e-01  1.997e-01   1.224 0.220846    
## JOBStudent                     2.912e-01  2.397e-01   1.215 0.224414    
## KIDSDRIV                       4.117e-01  6.267e-02   6.570 5.04e-11 ***
## MSTATUSYes                    -4.694e-01  9.018e-02  -5.205 1.94e-07 ***
## MVR_PTS                        1.126e-01  1.521e-02   7.405 1.31e-13 ***
## OLDCLAIM                      -1.125e-05  4.392e-06  -2.561 0.010440 *  
## PARENT1Yes                     5.434e-01  1.050e-01   5.174 2.29e-07 ***
## REVOKEDYes                     7.945e-01  1.025e-01   7.749 9.24e-15 ***
## TIF                           -5.630e-02  8.136e-03  -6.920 4.52e-12 ***
## TRAVTIME                       1.483e-02  2.103e-03   7.050 1.79e-12 ***
## URBANICITYHighly Urban/ Urban  2.451e+00  1.264e-01  19.400  < 2e-16 ***
## YOJ                           -1.577e-02  9.362e-03  -1.684 0.092166 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7535.7  on 6528  degrees of freedom
## Residual deviance: 5830.4  on 6497  degrees of freedom
## AIC: 5894.4
## 
## Number of Fisher Scoring iterations: 5
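
Because the coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are easier to interpret; for example, the URBANICITY coefficient of 2.451 corresponds to exp(2.451) ≈ 11.6, i.e. urban drivers have roughly 11.6 times the odds of a crash, holding the other predictors fixed:

# Odds ratios for the selected model's coefficients
exp(coef(final_logit_model))  # e.g. URBANICITYHighly Urban/ Urban: ~11.6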

Linear Model Evaluation

evaluate_lm_model <- function(model, test_data, model_name) {

  preds <- predict(model, newdata = test_data)
  actual <- test_data$LOG_TARGET_AMT

  rmse <- sqrt(mean((preds - actual)^2))
  mae  <- mean(abs(preds - actual))
  r2   <- 1 - sum((preds - actual)^2) / sum((actual - mean(actual))^2)
  aic  <- AIC(model)

  output <- paste(
    "\n=== Model Selection and Evaluation ===\n\n",
    "=== ", model_name, " Evaluation ===\n",
    "RMSE:", round(rmse, 4), 
    "| MAE:", round(mae, 4),
    "| R²:", round(r2, 4), "\n",
    "AIC:", round(aic, 4), "\n\n",
    sep = " "
  )

  cat(output)
}
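
Note that these metrics live on the log-dollar scale, since actual is LOG_TARGET_AMT; reporting errors in dollars requires back-transforming first. A minimal sketch, assuming LOG_TARGET_AMT was constructed as log(TARGET_AMT + 1) (as the commented transformation code later suggests), with a hypothetical helper name:

# Dollar-scale RMSE for a log1p-target model (hypothetical helper)
rmse_dollars <- function(model, test_data) {
  preds_d  <- expm1(predict(model, newdata = test_data))  # expm1 inverts log(y + 1)
  actual_d <- expm1(test_data$LOG_TARGET_AMT)
  sqrt(mean((preds_d - actual_d)^2))
}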

# === LM1 Evaluation ===
evaluate_lm_model(lm_model1, test_crash, "LM1: Basic Model")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM1: Basic Model  Evaluation ===
##  RMSE: 0.8332 | MAE: 0.5778 | R²: 0.0229 
##  AIC: 5200.2343
# === LM2 Evaluation ===
evaluate_lm_model(lm_model2, test_crash, "LM2: Full Model")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM2: Full Model  Evaluation ===
##  RMSE: 0.8321 | MAE: 0.5773 | R²: 0.0253 
##  AIC: 5212.0871
# === LM3 Evaluation ===
evaluate_lm_model(lm_model3, test_crash, "LM3: Log-Transformed")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM3: Log-Transformed  Evaluation ===
##  RMSE: 0.8332 | MAE: 0.5778 | R²: 0.0229 
##  AIC: 5200.2343
# === LM4 Evaluation ===
evaluate_lm_model(lm_model4, test_crash, "LM4: Log + New Features")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM4: Log + New Features  Evaluation ===
##  RMSE: 0.8321 | MAE: 0.5773 | R²: 0.0253 
##  AIC: 5212.0871
# === LM5 Evaluation ===
evaluate_lm_model(lm_model5, test_crash, "LM5: Stepwise AIC Model")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM5: Stepwise AIC Model  Evaluation ===
##  RMSE: 0.8353 | MAE: 0.5788 | R²: 0.0179 
##  AIC: 5184.7439
# === LM6 Evaluation ===
evaluate_lm_model(lm_model6, test_crash, "LM6: Interaction Model")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM6: Interaction Model  Evaluation ===
##  RMSE: 0.8326 | MAE: 0.5797 | R²: 0.0242 
##  AIC: 5211.109
# === LM7 Evaluation ===
evaluate_lm_model(lm_model7, test_crash, "LM7: Polynomial Model")
## 
## === Model Selection and Evaluation ===
## 
##  ===  LM7: Polynomial Model  Evaluation ===
##  RMSE: 0.8297 | MAE: 0.5738 | R²: 0.031 
##  AIC: 5194.9383
# Linear Model Diagnostics

lm_list <- list(
  "LM1: Basic Model" = lm_model1,
  "LM2: Full Model" = lm_model2,
  "LM3: Log-Transformed" = lm_model3,
  "LM4: Log + New Features" = lm_model4,
  "LM5: Stepwise AIC" = lm_model5,
  "LM6: Interaction" = lm_model6,
  "LM7: Polynomial" = lm_model7
)

for (name in names(lm_list)) {
  model <- lm_list[[name]]
  cat("\n=== Diagnostics for", name, "===\n")
  
  # Residuals vs Fitted
  plot(model, which = 1, main = paste("Residuals vs Fitted -", name))
  
  # Q-Q Plot
  plot(model, which = 2, main = paste("Normal Q-Q Plot -", name))
  
  # Scale-Location Plot
  plot(model, which = 3, main = paste("Scale-Location Plot -", name))
  
  # Residuals vs Leverage
  plot(model, which = 5, main = paste("Residuals vs Leverage -", name))
  
  # High leverage cutoff
  p <- length(coef(model))
  n <- nrow(train_crash)
  high_lev <- (2 * (p+1)) / n
  cat("High leverage threshold:", round(high_lev, 5), "\n")
  
  # Half-normal plot of leverage values
  halfnorm(hatvalues(model), main = paste("Half-Normal Plot of Leverage -", name))
}
## 
## === Diagnostics for LM1: Basic Model ===

## High leverage threshold: 0.01301

## 
## === Diagnostics for LM2: Full Model ===

## High leverage threshold: 0.01951

## 
## === Diagnostics for LM3: Log-Transformed ===

## High leverage threshold: 0.01301

## 
## === Diagnostics for LM4: Log + New Features ===

## High leverage threshold: 0.01951

## 
## === Diagnostics for LM5: Stepwise AIC ===

## High leverage threshold: 0.00464

## 
## === Diagnostics for LM6: Interaction ===

## High leverage threshold: 0.01951

## 
## === Diagnostics for LM7: Polynomial ===

## High leverage threshold: 0.01486

# Redefined to return a one-row data frame instead of printing, so results can be row-bound into a comparison table
evaluate_lm_model <- function(model, test_data, model_name) {

  preds <- predict(model, newdata = test_data)
  actual <- test_data$LOG_TARGET_AMT

  rmse <- sqrt(mean((preds - actual)^2))
  mae  <- mean(abs(preds - actual))
  r2   <- 1 - sum((preds - actual)^2) / sum((actual - mean(actual))^2)
  aic  <- AIC(model)

  return(data.frame(
    Model = model_name,
    RMSE = round(rmse, 4),
    MAE = round(mae, 4),
    R_squared = round(r2, 4),
    AIC = round(aic, 4)
  ))
}

comparison <- rbind(
  evaluate_lm_model(lm_model1, test_crash, "LM1: Basic Model"),
  evaluate_lm_model(lm_model2, test_crash, "LM2: Full Model"),
  evaluate_lm_model(lm_model3, test_crash, "LM3: Log-Transformed"),
  evaluate_lm_model(lm_model4, test_crash, "LM4: Log + New Features"),
  evaluate_lm_model(lm_model5, test_crash, "LM5: Stepwise AIC Model"),
  evaluate_lm_model(lm_model6, test_crash, "LM6: Interaction Model"),
  evaluate_lm_model(lm_model7, test_crash, "LM7: Polynomial Model")
)

print(comparison)
##                     Model   RMSE    MAE R_squared      AIC
## 1        LM1: Basic Model 0.8332 0.5778    0.0229 5200.234
## 2         LM2: Full Model 0.8321 0.5773    0.0253 5212.087
## 3    LM3: Log-Transformed 0.8332 0.5778    0.0229 5200.234
## 4 LM4: Log + New Features 0.8321 0.5773    0.0253 5212.087
## 5 LM5: Stepwise AIC Model 0.8353 0.5788    0.0179 5184.744
## 6  LM6: Interaction Model 0.8326 0.5797    0.0242 5211.109
## 7   LM7: Polynomial Model 0.8297 0.5738    0.0310 5194.938
kable(
  comparison,
  caption = "Multiple Linear Regression Models",
  digits = 4,
  align = "c"
)
Multiple Linear Regression Models

Model                      RMSE     MAE     R_squared   AIC
LM1: Basic Model           0.8332   0.5778   0.0229     5200.234
LM2: Full Model            0.8321   0.5773   0.0253     5212.087
LM3: Log-Transformed       0.8332   0.5778   0.0229     5200.234
LM4: Log + New Features    0.8321   0.5773   0.0253     5212.087
LM5: Stepwise AIC Model    0.8353   0.5788   0.0179     5184.744
LM6: Interaction Model     0.8326   0.5797   0.0242     5211.109
LM7: Polynomial Model      0.8297   0.5738   0.0310     5194.938

Note that LM1/LM3 and LM2/LM4 report identical metrics and AICs, suggesting each pair reduces to the same fitted model once the log transform is applied; LM7 gives the best held-out RMSE, MAE, and R².

6 Predictions on evaluation data

data_eval <- read.csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-621/refs/heads/main/insurance-evaluation-data.csv")

# Check for missing values per variable
data_eval %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "missing_count") %>%
  filter(missing_count != 0) %>%
  arrange(desc(missing_count))
## # A tibble: 5 × 2
##   variable    missing_count
##   <chr>               <int>
## 1 TARGET_FLAG          2141
## 2 TARGET_AMT           2141
## 3 CAR_AGE               129
## 4 YOJ                    94
## 5 AGE                     1
# Check for duplicates
duplicates <- duplicated(data_eval)

# Print the duplicates
print(data_eval[duplicates, ])
##  [1] INDEX       TARGET_FLAG TARGET_AMT  KIDSDRIV    AGE         HOMEKIDS   
##  [7] YOJ         INCOME      PARENT1     HOME_VAL    MSTATUS     SEX        
## [13] EDUCATION   JOB         TRAVTIME    CAR_USE     BLUEBOOK    TIF        
## [19] CAR_TYPE    RED_CAR     OLDCLAIM    CLM_FREQ    REVOKED     MVR_PTS    
## [25] CAR_AGE     URBANICITY 
## <0 rows> (or 0-length row.names)
head(data_eval)
##   INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ  INCOME PARENT1
## 1     3          NA         NA        0  48        0  11 $52,881      No
## 2     9          NA         NA        1  40        1  11 $50,815     Yes
## 3    10          NA         NA        0  44        2  12 $43,486     Yes
## 4    18          NA         NA        0  35        2  NA $21,204     Yes
## 5    21          NA         NA        0  59        0  12 $87,460      No
## 6    30          NA         NA        0  46        0  14              No
##   HOME_VAL MSTATUS SEX     EDUCATION           JOB TRAVTIME    CAR_USE BLUEBOOK
## 1       $0    z_No   M     Bachelors       Manager       26    Private  $21,970
## 2       $0    z_No   M z_High School       Manager       21    Private  $18,930
## 3       $0    z_No z_F z_High School z_Blue Collar       30 Commercial   $5,900
## 4       $0    z_No   M z_High School      Clerical       74    Private   $9,230
## 5       $0    z_No   M z_High School       Manager       45    Private  $15,420
## 6 $207,519     Yes   M     Bachelors  Professional        7 Commercial  $25,660
##   TIF    CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1   1         Van     yes       $0        0      No       2      10
## 2   6     Minivan      no   $3,295        1      No       2       1
## 3  10       z_SUV      no       $0        0      No       0      10
## 4   6      Pickup      no       $0        0     Yes       0       4
## 5   1     Minivan     yes  $44,857        2      No       4       1
## 6   1 Panel Truck      no   $2,119        1      No       2      12
##              URBANICITY
## 1   Highly Urban/ Urban
## 2   Highly Urban/ Urban
## 3 z_Highly Rural/ Rural
## 4 z_Highly Rural/ Rural
## 5   Highly Urban/ Urban
## 6   Highly Urban/ Urban
# Data clean up

# removing 'z_'
data_eval <- as.data.frame(lapply(data_eval, function(x) gsub("^z_", "", x)))
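
The lapply() call above coerces every column to character, which is why the numeric fields have to be converted back below; a sketch of an alternative that touches only the character columns and so preserves column types (assuming dplyr is loaded):

# Strip the z_ prefix from character columns only; numeric columns keep their types
data_eval <- data_eval %>%
  mutate(across(where(is.character), ~ gsub("^z_", "", .x)))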


# Fix data format for the amount variables by stripping "$" and ","
data_eval$INCOME = as.numeric(gsub("[$,]", "", data_eval$INCOME))
data_eval$HOME_VAL = as.numeric(gsub("[$,]", "", data_eval$HOME_VAL))
data_eval$BLUEBOOK = as.numeric(gsub("[$,]", "", data_eval$BLUEBOOK))
data_eval$OLDCLAIM = as.numeric(gsub("[$,]", "", data_eval$OLDCLAIM))

# CAR_AGE has an invalid negative value (-3); set negative ages to NA
data_eval$CAR_AGE = ifelse(data_eval$CAR_AGE < 0, NA, data_eval$CAR_AGE)

# Drop the "<" prefix from EDUCATION ("<High School"), as in the training cleanup
data_eval$EDUCATION = gsub("<", "", data_eval$EDUCATION)

# Convert fields back to integer/numeric; they became character when the z_ prefix was stripped
data_eval$TARGET_FLAG = as.integer(data_eval$TARGET_FLAG)
data_eval$KIDSDRIV = as.integer(data_eval$KIDSDRIV)
data_eval$AGE = as.integer(data_eval$AGE)
data_eval$HOMEKIDS = as.integer(data_eval$HOMEKIDS)
data_eval$TRAVTIME = as.integer(data_eval$TRAVTIME)
data_eval$YOJ = as.integer(data_eval$YOJ)
data_eval$TIF = as.integer(data_eval$TIF)
data_eval$CLM_FREQ = as.integer(data_eval$CLM_FREQ)
data_eval$MVR_PTS = as.integer(data_eval$MVR_PTS)
data_eval$CAR_AGE = as.integer(data_eval$CAR_AGE)

data_eval$TARGET_AMT= as.numeric(data_eval$TARGET_AMT)


# Convert categorical variables to factors
data_eval$EDUCATION <- factor(data_eval$EDUCATION)
data_eval$JOB <- factor(data_eval$JOB)
data_eval$CAR_TYPE <- factor(data_eval$CAR_TYPE)
data_eval$CAR_USE <- factor(data_eval$CAR_USE)
data_eval$MSTATUS <- factor(data_eval$MSTATUS)
data_eval$PARENT1 <- factor(data_eval$PARENT1)
data_eval$RED_CAR <- factor(data_eval$RED_CAR)
data_eval$SEX <- factor(data_eval$SEX)
data_eval$URBANICITY <- factor(data_eval$URBANICITY)
data_eval$REVOKED <- factor(data_eval$REVOKED)

# The INDEX column has no impact on the target variable, so drop it
data_eval <- data_eval[ , -1]

head(data_eval)
##   TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1          NA         NA        0  48        0  11  52881      No        0
## 2          NA         NA        1  40        1  11  50815     Yes        0
## 3          NA         NA        0  44        2  12  43486     Yes        0
## 4          NA         NA        0  35        2  NA  21204     Yes        0
## 5          NA         NA        0  59        0  12  87460      No        0
## 6          NA         NA        0  46        0  14     NA      No   207519
##   MSTATUS SEX   EDUCATION          JOB TRAVTIME    CAR_USE BLUEBOOK TIF
## 1      No   M   Bachelors      Manager       26    Private    21970   1
## 2      No   M High School      Manager       21    Private    18930   6
## 3      No   F High School  Blue Collar       30 Commercial     5900  10
## 4      No   M High School     Clerical       74    Private     9230   6
## 5      No   M High School      Manager       45    Private    15420   1
## 6     Yes   M   Bachelors Professional        7 Commercial    25660   1
##      CAR_TYPE RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 1         Van     yes        0        0      No       2      10
## 2     Minivan      no     3295        1      No       2       1
## 3         SUV      no        0        0      No       0      10
## 4      Pickup      no        0        0     Yes       0       4
## 5     Minivan     yes    44857        2      No       4       1
## 6 Panel Truck      no     2119        1      No       2      12
##            URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Urban/ Urban
## 3 Highly Rural/ Rural
## 4 Highly Rural/ Rural
## 5 Highly Urban/ Urban
## 6 Highly Urban/ Urban
# Fix missing values
data_eval <- data_eval %>% 
   mutate_at(vars(c("CAR_AGE", "YOJ", "AGE", "INCOME", "HOME_VAL")), ~ifelse(is.na(.), median(., na.rm = TRUE), .))
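
One caveat: these medians are computed on the evaluation set itself, whereas the fitted models saw training-set medians; imputing from the training data keeps the two pipelines consistent. A sketch, assuming data_train still holds the cleaned training data:

# Impute evaluation-set gaps with medians taken from the cleaned training data
for (v in c("CAR_AGE", "YOJ", "AGE", "INCOME", "HOME_VAL")) {
  data_eval[[v]][is.na(data_eval[[v]])] <- median(data_train[[v]], na.rm = TRUE)
}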

# Count of missing values per variable
missing_eval <- sapply(data_eval, function(x) sum(is.na(x)))
kable(data.frame(Variable = names(missing_eval), Missing = missing_eval), caption = "Missing Values in Each Variable")
Missing Values in Each Variable

Variable      Missing
TARGET_FLAG      2141
TARGET_AMT       2141
KIDSDRIV            0
AGE                 0
HOMEKIDS            0
YOJ                 0
INCOME              0
PARENT1             0
HOME_VAL            0
MSTATUS             0
SEX                 0
EDUCATION           0
JOB                 0
TRAVTIME            0
CAR_USE             0
BLUEBOOK            0
TIF                 0
CAR_TYPE            0
RED_CAR             0
OLDCLAIM            0
CLM_FREQ            0
REVOKED             0
MVR_PTS             0
CAR_AGE             0
URBANICITY          0
# Make binary classification predictions


eval_logit_predictions <- predict(final_logit_model, data_eval, type = "response")
eval_classifications <- ifelse(eval_logit_predictions > 0.5, 1, 0)
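
The 0.5 cutoff mirrors the test-set evaluation above; given the class imbalance, a data-driven threshold could trade specificity for sensitivity more deliberately. A sketch using Youden's J on the Model 2 test ROC object computed earlier with pROC:

# Threshold maximizing Youden's J (sensitivity + specificity - 1)
coords(roc2, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))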
 
# Create output data frame
eval_output <- cbind(data_eval, 
Probability = round(eval_logit_predictions, 4),
Classification = eval_classifications
)

# Bar plot of the predicted classifications (0 = no crash, 1 = crash)
ggplot(eval_output, aes(x = factor(Classification))) +
  geom_bar(fill = "steelblue", color = "black") +
  labs(
    title = "Distribution of Predicted Classifications",
    x = "Was in Car Crash? (0 = No, 1 = Yes)",
    y = "Count"
  ) +
  theme_minimal()

# Save predictions
write.csv(eval_output, "crash_classification_predictions.csv", row.names = FALSE)
# Make multiple linear regression predictions

# For reference:
# lm_model7 <- lm(LOG_TARGET_AMT ~ SQRT_BLUEBOOK + I(SQRT_BLUEBOOK^2) +
#     CAR_AGE + I(CAR_AGE^2) +
#     LOG_INCOME + I(LOG_INCOME^2) +
#     CLM_FREQ + MVR_PTS +
#     CAR_TYPE + CAR_USE,
#   data = train_crash
# )

lm_eval_data <- eval_output
lm_eval_data$TARGET_FLAG <- eval_output$Classification

lm_eval_data_transform <- lm_eval_data %>%
  mutate(
    # LOG_TARGET_AMT = log(TARGET_AMT + 1), # not available here: TARGET_AMT is missing in the evaluation data
    LOG_OLDCLAIM = log(OLDCLAIM + 1),
    SQRT_BLUEBOOK = sqrt(BLUEBOOK),
    LOG_HOME_VAL = log(HOME_VAL + 1),
    LOG_INCOME = log(INCOME + 1),
    LOG_TRAVTIME = log(TRAVTIME + 1),
    LOG_TIF = log(TIF + 1),
    LOG_CAR_AGE = log(CAR_AGE + 1)
  )

lm_eval_data_transform_crash <- lm_eval_data_transform %>% filter(TARGET_FLAG == 1)


# Polynomial model
lm_predictions <- predict(lm_model7, newdata = lm_eval_data_transform_crash)


# Create output dataframe
lm_eval_output <- cbind(lm_eval_data_transform_crash, 
TARGET_AMT = round(exp(lm_predictions), 4)
)

# Remove the original all-NA TARGET_AMT column (column 2), superseded by the predicted TARGET_AMT
lm_eval_output <- lm_eval_output[ , -2]

# Save predictions
write.csv(lm_eval_output, "crash_lm_predictions.csv", row.names = FALSE)
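
One refinement worth noting: exp(lm_predictions) inverts log(TARGET_AMT) but not log(TARGET_AMT + 1), and naive exponentiation of log-scale predictions tends to underestimate the conditional mean in dollars. A hedged sketch of a bias-adjusted back-transform, assuming the log1p definition suggested by the commented line above:

# Duan's smearing factor, estimated from the training-set residuals
smear <- mean(exp(residuals(lm_model7)))
# Inverse of log(y + 1) with the smearing adjustment applied
lm_predictions_dollars <- exp(lm_predictions) * smear - 1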