DATA 621 – Business Analytics and Data Mining Homework #4 Assignment Requirements
Overview In this homework assignment, you will explore, analyze and model a data set containing approximately 8000 records representing a customer at an auto insurance company. Each record has two response variables. The first response variable, TARGET_FLAG, is a 1 or a 0. A “1” means that the person was in a car crash. A zero means that the person was not in a car crash. The second response variable is TARGET_AMT. This value is zero if the person did not crash their car. But if they did crash their car, this number will be a value greater than zero.
Your objective is to build multiple linear regression and binary logistic regression models on the training data to predict the probability that a person will crash their car and also the amount of money it will cost if the person does crash their car. You can only use the variables given to you (or variables that you derive from the variables provided).
library(tidyverse)
library(ggplot2)
library(mice)
library(car)
library(Hmisc)
library(corrplot)
library(pscl)
library(boot)
library(MASS)
# Load insurance csv
df_ins_raw <- read.csv("insurance_training_data.csv")
# Removing index as instructed
df_ins_raw <- subset(df_ins_raw, select = -c(INDEX))
# Preview data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ TARGET_AMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 4021.0…
## $ KIDSDRIV <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE <int> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53, 45…
## $ HOMEKIDS <int> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 2, 1…
## $ YOJ <int> 11, 11, 10, 14, NA, 12, NA, NA, 10, 7, 14, 5, 11, 11, 0, 1…
## $ INCOME <chr> "$67,349", "$91,449", "$16,039", "", "$114,986", "$125,301…
## $ PARENT1 <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL <chr> "$0", "$257,252", "$124,191", "$306,251", "$243,925", "$0"…
## $ MSTATUS <chr> "z_No", "z_No", "Yes", "Yes", "Yes", "z_No", "Yes", "Yes",…
## $ SEX <chr> "M", "M", "z_F", "M", "z_F", "z_F", "z_F", "M", "z_F", "M"…
## $ EDUCATION <chr> "PhD", "z_High School", "z_High School", "<High School", "…
## $ JOB <chr> "Professional", "z_Blue Collar", "Clerical", "z_Blue Colla…
## $ TRAVTIME <int> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64, 48,…
## $ CAR_USE <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK <chr> "$14,230", "$14,940", "$4,010", "$15,440", "$18,000", "$17…
## $ TIF <int> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7, 4, …
## $ CAR_TYPE <chr> "Minivan", "Minivan", "z_SUV", "Minivan", "z_SUV", "Sports…
## $ RED_CAR <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM <chr> "$4,461", "$0", "$38,690", "$0", "$19,217", "$0", "$0", "$…
## $ CLM_FREQ <int> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2…
## $ REVOKED <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS <int> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0, 0, …
## $ CAR_AGE <int> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13, 16,…
## $ URBANICITY <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…
remove_z <- function(x){
str_replace(x, 'z_', '')
}
# Remove extraneous z_
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_z))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
remove_dollar <- function(x){
str_replace(x, '\\$', '')
}
# Remove dollar sign from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_dollar))
remove_comma <- function(x){
str_replace(x, ',', '')
}
# Remove commas from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_comma))
# Preview updated data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <chr> "0", "0", "0", "0", "0", "1", "0", "1", "1", "0", "1", "0"…
## $ TARGET_AMT <chr> "0", "0", "0", "0", "0", "2946", "0", "4021", "2501", "0",…
## $ KIDSDRIV <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0"…
## $ AGE <chr> "60", "43", "35", "51", "50", "34", "54", "37", "34", "50"…
## $ HOMEKIDS <chr> "0", "0", "1", "0", "0", "1", "0", "2", "0", "0", "0", "0"…
## $ YOJ <chr> "11", "11", "10", "14", NA, "12", NA, NA, "10", "7", "14",…
## $ INCOME <chr> "67349", "91449", "16039", "", "114986", "125301", "18755"…
## $ PARENT1 <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL <chr> "0", "257252", "124191", "306251", "243925", "0", "", "333…
## $ MSTATUS <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No",…
## $ SEX <chr> "M", "M", "F", "M", "F", "F", "F", "M", "F", "M", "F", "F"…
## $ EDUCATION <chr> "PhD", "High School", "High School", "<High School", "PhD"…
## $ JOB <chr> "Professional", "Blue Collar", "Clerical", "Blue Collar", …
## $ TRAVTIME <chr> "14", "22", "5", "32", "36", "46", "33", "44", "34", "48",…
## $ CAR_USE <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK <chr> "14230", "14940", "4010", "15440", "18000", "17430", "8780…
## $ TIF <chr> "11", "1", "4", "7", "1", "1", "1", "1", "1", "7", "1", "7…
## $ CAR_TYPE <chr> "Minivan", "Minivan", "SUV", "Minivan", "SUV", "Sports Car…
## $ RED_CAR <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM <chr> "4461", "0", "38690", "0", "19217", "0", "0", "2374", "0",…
## $ CLM_FREQ <chr> "2", "0", "2", "0", "2", "0", "0", "1", "0", "0", "0", "0"…
## $ REVOKED <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS <chr> "3", "0", "3", "0", "3", "0", "0", "10", "0", "1", "0", "0…
## $ CAR_AGE <chr> "18", "1", "10", "6", "17", "7", "1", "7", "1", "17", "11"…
## $ URBANICITY <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…
# Count of distinct values for each column
df_ins_raw %>% summarise_all(n_distinct)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 2 1949 5 61 6 22 6613 2 5107
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## 1 2 2 5 9 97 2 2789 23 6 2
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1 2857 6 2 13 31 2
df_ins_raw %>% distinct(PARENT1)
## PARENT1
## 1 No
## 2 Yes
df_ins_raw %>% distinct(MSTATUS)
## MSTATUS
## 1 No
## 2 Yes
df_ins_raw %>% distinct(SEX)
## SEX
## 1 M
## 2 F
df_ins_raw %>% distinct(EDUCATION)
## EDUCATION
## 1 PhD
## 2 High School
## 3 <High School
## 4 Bachelors
## 5 Masters
df_ins_raw %>% distinct(JOB)
## JOB
## 1 Professional
## 2 Blue Collar
## 3 Clerical
## 4 Doctor
## 5 Lawyer
## 6 Manager
## 7
## 8 Home Maker
## 9 Student
df_ins_raw %>% distinct(CAR_USE)
## CAR_USE
## 1 Private
## 2 Commercial
df_ins_raw %>% distinct(CAR_TYPE)
## CAR_TYPE
## 1 Minivan
## 2 SUV
## 3 Sports Car
## 4 Van
## 5 Panel Truck
## 6 Pickup
df_ins_raw %>% distinct(CLM_FREQ)
## CLM_FREQ
## 1 2
## 2 0
## 3 1
## 4 3
## 5 5
## 6 4
df_ins_raw %>% distinct(REVOKED)
## REVOKED
## 1 No
## 2 Yes
df_ins_raw %>% distinct(URBANICITY)
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Rural/ Rural
# Set data types for variables
df_ins_clean <- df_ins_raw %>% transform(
TARGET_FLAG = as.factor(TARGET_FLAG),
TARGET_AMT = as.numeric(TARGET_AMT),
KIDSDRIV = as.factor(KIDSDRIV),
AGE = as.numeric(AGE),
HOMEKIDS = as.factor(HOMEKIDS),
YOJ = as.numeric(YOJ),
INCOME = as.numeric(INCOME),
PARENT1 = as.factor(PARENT1),
HOME_VAL = as.numeric(HOME_VAL),
MSTATUS = as.factor(MSTATUS),
SEX = as.factor(SEX),
EDUCATION = as.factor(EDUCATION),
JOB = as.factor(JOB),
TRAVTIME = as.numeric(TRAVTIME),
CAR_USE = as.factor(CAR_USE),
BLUEBOOK = as.numeric(BLUEBOOK),
TIF = as.numeric(TIF), # factor or numeric?
CAR_TYPE = as.factor(CAR_TYPE),
RED_CAR = as.factor(RED_CAR),
OLDCLAIM = as.numeric(OLDCLAIM),
CLM_FREQ = as.ordered(CLM_FREQ), # factor or numeric?
REVOKED = as.factor(REVOKED),
MVR_PTS = as.numeric(MVR_PTS),
CAR_AGE = as.numeric(CAR_AGE),
URBANICITY = as.factor(URBANICITY))
# Confirm CLM_FREQ is an ordered factor
is.ordered(df_ins_clean$CLM_FREQ)
## [1] TRUE
# NA counts for each column
colSums(is.na(df_ins_clean))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## 0 0 0 6 0 454
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## 445 0 464 0 0 0
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 0 0 0 0 0 0
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 0 0 0 0 0 510
## URBANICITY
## 0
# Visualize NA counts for each column
df_ins_clean %>%
summarise_all(list(~is.na(.)))%>%
pivot_longer(everything(),
names_to = "variables", values_to="missing") %>%
count(variables, missing) %>%
ggplot(aes(y=variables,x=n,fill=missing))+
geom_col()
# Impute data by regression:
df_ins_imp <- mice(df_ins_clean, method = "norm.predict", m = 1, remove.collinear=FALSE)
##
## iter imp variable
## 1 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 2 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 3 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 4 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 5 1 AGE YOJ INCOME HOME_VAL CAR_AGE
df_ins_imp <- complete(df_ins_imp)
# Confirm no NAs remain
colSums(is.na(df_ins_imp))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## 0 0 0 0 0 0
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## 0 0 0 0 0 0
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 0 0 0 0 0 0
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 0 0 0 0 0 0
## URBANICITY
## 0
describe(df_ins_imp)
## df_ins_imp
##
## 25 Variables 8161 Observations
## --------------------------------------------------------------------------------
## TARGET_FLAG
## n missing distinct
## 8161 0 2
##
## Value 0 1
## Frequency 6008 2153
## Proportion 0.736 0.264
## --------------------------------------------------------------------------------
## TARGET_AMT
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 1949 0.601 1504 2574 0 0
## .25 .50 .75 .90 .95
## 0 0 1036 4904 6452
##
## lowest : 0.00000 30.27728 58.53106 95.56732 108.74150
## highest: 73783.46592 77907.43028 78874.19056 85523.65335 107586.13616
## --------------------------------------------------------------------------------
## KIDSDRIV
## n missing distinct
## 8161 0 5
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 7180 636 279 62 4
## Proportion 0.880 0.078 0.034 0.008 0.000
## --------------------------------------------------------------------------------
## AGE
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 66 0.999 44.78 9.749 30 34
## .25 .50 .75 .90 .95
## 39 45 51 56 59
##
## lowest : 16 17 18 19 20, highest: 72 73 76 80 81
## --------------------------------------------------------------------------------
## HOMEKIDS
## n missing distinct
## 8161 0 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 5289 902 1118 674 164 14
## Proportion 0.648 0.111 0.137 0.083 0.020 0.002
## --------------------------------------------------------------------------------
## YOJ
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 475 0.991 10.5 4.2 0 5
## .25 .50 .75 .90 .95
## 9 11 13 14 15
##
## lowest : 0.000000 1.000000 2.000000 2.748155 3.000000
## highest: 16.348253 17.000000 18.000000 19.000000 23.000000
## --------------------------------------------------------------------------------
## INCOME
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 7057 1 61594 51009 0 4362
## .25 .50 .75 .90 .95
## 27940 54216 85472 122744 151663
##
## lowest : -31968.54 -26991.29 -20478.35 -16829.41 -16714.01
## highest: 306277.00 309628.00 320127.00 332339.00 367030.00
## --------------------------------------------------------------------------------
## PARENT1
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 7084 1077
## Proportion 0.868 0.132
## --------------------------------------------------------------------------------
## HOME_VAL
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 5570 0.978 154934 142397 0 0
## .25 .50 .75 .90 .95
## 0 160953 237604 314151 373031
##
## lowest : -86468.16 -71570.86 -71165.20 -70648.14 -68150.78
## highest: 657804.00 682634.00 738153.00 750455.00 885282.00
## --------------------------------------------------------------------------------
## MSTATUS
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 3267 4894
## Proportion 0.4 0.6
## --------------------------------------------------------------------------------
## SEX
## n missing distinct
## 8161 0 2
##
## Value F M
## Frequency 4375 3786
## Proportion 0.536 0.464
## --------------------------------------------------------------------------------
## EDUCATION
## n missing distinct
## 8161 0 5
##
## lowest : <High School Bachelors High School Masters PhD
## highest: <High School Bachelors High School Masters PhD
##
## Value <High School Bachelors High School Masters PhD
## Frequency 1203 2242 2330 1658 728
## Proportion 0.147 0.275 0.286 0.203 0.089
## --------------------------------------------------------------------------------
## JOB
## n missing distinct
## 8161 0 9
##
## lowest : Blue Collar Clerical Doctor Home Maker
## highest: Home Maker Lawyer Manager Professional Student
##
## Value Blue Collar Clerical Doctor Home Maker
## Frequency 526 1825 1271 246 641
## Proportion 0.064 0.224 0.156 0.030 0.079
##
## Value Lawyer Manager Professional Student
## Frequency 835 988 1117 712
## Proportion 0.102 0.121 0.137 0.087
## --------------------------------------------------------------------------------
## TRAVTIME
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 97 1 33.49 17.85 7 13
## .25 .50 .75 .90 .95
## 22 33 44 54 60
##
## lowest : 5 6 7 8 9, highest: 103 113 124 134 142
## --------------------------------------------------------------------------------
## CAR_USE
## n missing distinct
## 8161 0 2
##
## Value Commercial Private
## Frequency 3029 5132
## Proportion 0.371 0.629
## --------------------------------------------------------------------------------
## BLUEBOOK
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 2789 1 15710 9354 4900 6000
## .25 .50 .75 .90 .95
## 9280 14440 20850 27460 31110
##
## lowest : 1500 1520 1530 1540 1590, highest: 57970 61050 62240 65970 69740
## --------------------------------------------------------------------------------
## TIF
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 23 0.961 5.351 4.512 1 1
## .25 .50 .75 .90 .95
## 1 4 7 11 13
##
## lowest : 1 2 3 4 5, highest: 19 20 21 22 25
## --------------------------------------------------------------------------------
## CAR_TYPE
## n missing distinct
## 8161 0 6
##
## lowest : Minivan Panel Truck Pickup Sports Car SUV
## highest: Panel Truck Pickup Sports Car SUV Van
##
## Value Minivan Panel Truck Pickup Sports Car SUV
## Frequency 2145 676 1389 907 2294
## Proportion 0.263 0.083 0.170 0.111 0.281
##
## Value Van
## Frequency 750
## Proportion 0.092
## --------------------------------------------------------------------------------
## RED_CAR
## n missing distinct
## 8161 0 2
##
## Value no yes
## Frequency 5783 2378
## Proportion 0.709 0.291
## --------------------------------------------------------------------------------
## OLDCLAIM
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 2857 0.769 4037 6563 0 0
## .25 .50 .75 .90 .95
## 0 0 4636 9583 27090
##
## lowest : 0 502 506 518 519, highest: 52507 53477 53568 53986 57037
## --------------------------------------------------------------------------------
## CLM_FREQ
## n missing distinct
## 8161 0 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 5009 997 1171 776 190 18
## Proportion 0.614 0.122 0.143 0.095 0.023 0.002
## --------------------------------------------------------------------------------
## REVOKED
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 7161 1000
## Proportion 0.877 0.123
## --------------------------------------------------------------------------------
## MVR_PTS
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 13 0.9 1.696 2.187 0 0
## .25 .50 .75 .90 .95
## 0 1 3 5 6
##
## lowest : 0 1 2 3 4, highest: 8 9 10 11 13
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 3712 1157 948 758 599 399 266 167 84 45 13
## Proportion 0.455 0.142 0.116 0.093 0.073 0.049 0.033 0.020 0.010 0.006 0.002
##
## Value 11 13
## Frequency 11 2
## Proportion 0.001 0.000
## --------------------------------------------------------------------------------
## CAR_AGE
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 540 0.985 8.347 6.374 1.000 1.000
## .25 .50 .75 .90 .95
## 3.514 8.000 12.000 16.000 18.000
##
## lowest : -3.0000000 -1.1184480 -0.9423218 0.0000000 1.0000000
## highest: 24.0000000 25.0000000 26.0000000 27.0000000 28.0000000
## --------------------------------------------------------------------------------
## URBANICITY
## n missing distinct
## 8161 0 2
##
## Value Highly Rural/ Rural Highly Urban/ Urban
## Frequency 1669 6492
## Proportion 0.205 0.795
## --------------------------------------------------------------------------------
# Histograms
df_ins_imp %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density(fill = "steelblue", alpha=0.9, color="steelblue") +
geom_histogram(aes(y=..density..), alpha=0.5, fill = "lightblue", color="lightblue", position="identity")
# Boxplots
df_ins_imp %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_boxplot(fill = "steelblue", color="black", outlier.colour="red", outlier.shape=16,
outlier.size=2, notch=FALSE)
# Log transformation
df_ins_imp_log <- df_ins_imp %>% keep(is.numeric)
df_ins_imp_log <- log(df_ins_imp_log + 1)
# Histograms of log transformed numeric variables
df_ins_imp_log %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_log$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_log$AGE[0:5000]
## W = 0.97949, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_log$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_log$AGE, col = "steelblue", lwd = 2)
# Square root transformation
df_ins_imp_sqrt <- sqrt(df_ins_imp %>% keep(is.numeric))
# Histograms of square root transformed numeric variables
df_ins_imp_sqrt %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_sqrt$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_sqrt$AGE[0:5000]
## W = 0.99371, p-value = 5.047e-14
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_sqrt$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_sqrt$AGE, col = "steelblue", lwd = 2)
# Cube root transformation
df_ins_imp_cube <- (df_ins_imp %>% keep(is.numeric))^(1/3)
# Histograms of cube root transformed numeric variables
df_ins_imp_cube %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_cube$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_cube$AGE[0:5000]
## W = 0.98989, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_cube$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_cube$AGE, col = "steelblue", lwd = 2)
df_ins <- df_ins_imp %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(log = ~ log(. + 1))))
df_ins <- df_ins %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(sqrt = sqrt)))
df_ins <- df_ins %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(cbrt = ~ .^(1/3))))
glimpse(df_ins)
## Rows: 8,161
## Columns: 58
## $ TARGET_FLAG <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ TARGET_AMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 40…
## $ KIDSDRIV <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AGE <dbl> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53…
## $ HOMEKIDS <fct> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, …
## $ YOJ <dbl> 11.00000, 11.00000, 10.00000, 14.00000, 11.34200, 12.0…
## $ INCOME <dbl> 67349.00, 91449.00, 16039.00, 67185.86, 114986.00, 125…
## $ PARENT1 <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ HOME_VAL <dbl> 0.0, 257252.0, 124191.0, 306251.0, 243925.0, 0.0, 1654…
## $ MSTATUS <fct> No, No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, …
## $ SEX <fct> M, M, F, M, F, F, F, M, F, M, F, F, M, M, F, F, M, F, …
## $ EDUCATION <fct> PhD, High School, High School, <High School, PhD, Bach…
## $ JOB <fct> Professional, Blue Collar, Clerical, Blue Collar, Doct…
## $ TRAVTIME <dbl> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64,…
## $ CAR_USE <fct> Private, Commercial, Private, Private, Private, Commer…
## $ BLUEBOOK <dbl> 14230, 14940, 4010, 15440, 18000, 17430, 8780, 16970, …
## $ TIF <dbl> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7,…
## $ CAR_TYPE <fct> Minivan, Minivan, SUV, Minivan, SUV, Sports Car, SUV, …
## $ RED_CAR <fct> yes, yes, no, yes, no, no, no, yes, no, no, no, no, ye…
## $ OLDCLAIM <dbl> 4461, 0, 38690, 0, 19217, 0, 0, 2374, 0, 0, 0, 0, 5028…
## $ CLM_FREQ <ord> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, …
## $ REVOKED <fct> No, No, No, No, Yes, No, No, Yes, No, No, No, No, Yes,…
## $ MVR_PTS <dbl> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0,…
## $ CAR_AGE <dbl> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13,…
## $ URBANICITY <fct> Highly Urban/ Urban, Highly Urban/ Urban, Highly Urban…
## $ TARGET_AMT_log <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 7.98…
## $ AGE_log <dbl> 4.110874, 3.784190, 3.583519, 3.951244, 3.931826, 3.55…
## $ YOJ_log <dbl> 2.484907, 2.484907, 2.397895, 2.708050, 2.513008, 2.56…
## $ INCOME_log <dbl> 11.117658, 11.423548, 9.682841, 11.115233, 11.652574, …
## $ HOME_VAL_log <dbl> 0.00000, 12.45782, 11.72958, 12.63216, 12.40462, 0.000…
## $ TRAVTIME_log <dbl> 2.708050, 3.135494, 1.791759, 3.496508, 3.610918, 3.85…
## $ BLUEBOOK_log <dbl> 9.563178, 9.611864, 8.296796, 9.644782, 9.798183, 9.76…
## $ TIF_log <dbl> 2.4849066, 0.6931472, 1.6094379, 2.0794415, 0.6931472,…
## $ OLDCLAIM_log <dbl> 8.403352, 0.000000, 10.563362, 0.000000, 9.863603, 0.0…
## $ MVR_PTS_log <dbl> 1.3862944, 0.0000000, 1.3862944, 0.0000000, 1.3862944,…
## $ CAR_AGE_log <dbl> 2.9444390, 0.6931472, 2.3978953, 1.9459101, 2.8903718,…
## $ TARGET_AMT_sqrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 54.27707,…
## $ AGE_sqrt <dbl> 7.745967, 6.557439, 5.916080, 7.141428, 7.071068, 5.83…
## $ YOJ_sqrt <dbl> 3.316625, 3.316625, 3.162278, 3.741657, 3.367788, 3.46…
## $ INCOME_sqrt <dbl> 259.5169, 302.4054, 126.6452, 259.2024, 339.0959, 353.…
## $ HOME_VAL_sqrt <dbl> 0.0000, 507.2002, 352.4074, 553.3995, 493.8876, 0.0000…
## $ TRAVTIME_sqrt <dbl> 3.741657, 4.690416, 2.236068, 5.656854, 6.000000, 6.78…
## $ BLUEBOOK_sqrt <dbl> 119.28956, 122.22929, 63.32456, 124.25780, 134.16408, …
## $ TIF_sqrt <dbl> 3.316625, 1.000000, 2.000000, 2.645751, 1.000000, 1.00…
## $ OLDCLAIM_sqrt <dbl> 66.79072, 0.00000, 196.69774, 0.00000, 138.62539, 0.00…
## $ MVR_PTS_sqrt <dbl> 1.732051, 0.000000, 1.732051, 0.000000, 1.732051, 0.00…
## $ CAR_AGE_sqrt <dbl> 4.242641, 1.000000, 3.162278, 2.449490, 4.123106, 2.64…
## $ TARGET_AMT_cbrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 14.33544,…
## $ AGE_cbrt <dbl> 3.914868, 3.503398, 3.271066, 3.708430, 3.684031, 3.23…
## $ YOJ_cbrt <dbl> 2.223980, 2.223980, 2.154435, 2.410142, 2.246794, 2.28…
## $ INCOME_cbrt <dbl> 40.68588, 45.05327, 25.21888, 40.65300, 48.62747, 50.0…
## $ HOME_VAL_cbrt <dbl> 0.00000, 63.59939, 49.89190, 67.40506, 62.48159, 0.000…
## $ TRAVTIME_cbrt <dbl> 2.410142, 2.802039, 1.709976, 3.174802, 3.301927, 3.58…
## $ BLUEBOOK_cbrt <dbl> 24.23269, 24.62919, 15.88723, 24.90094, 26.20741, 25.9…
## $ TIF_cbrt <dbl> 2.223980, 1.000000, 1.587401, 1.912931, 1.000000, 1.00…
## $ OLDCLAIM_cbrt <dbl> 16.46180, 0.00000, 33.82202, 0.00000, 26.78522, 0.0000…
## $ MVR_PTS_cbrt <dbl> 1.442250, 0.000000, 1.442250, 0.000000, 1.442250, 0.00…
## $ CAR_AGE_cbrt <dbl> 2.620741, 1.000000, 2.154435, 1.817121, 2.571282, 1.91…
df_ins$HV_INC_RATIO <- df_ins$HOME_VAL / df_ins$INCOME
df_ins$TRT_MVR_PRODUCT <- df_ins$TRAVTIME * df_ins$MVR_PTS
df_ins$HV_INC_RATIO[is.nan(df_ins$HV_INC_RATIO)] <- 0
df_ins$HV_INC_RATIO[is.infinite(df_ins$HV_INC_RATIO)] <- 0
# Impute data by regression:
df_ins <- mice(df_ins, method = "norm.predict", m = 1, remove.collinear=FALSE)
##
## iter imp variable
## 1 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt* HOME_VAL_sqrt* CAR_AGE_sqrt INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 2 1 INCOME_log HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 3 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 4 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 5 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## Warning: Number of logged events: 83
df_ins <- complete(df_ins)
# Confirm no NAs remain
colSums(is.na(df_ins))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS
## 0 0 0 0 0
## SEX EDUCATION JOB TRAVTIME CAR_USE
## 0 0 0 0 0
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM
## 0 0 0 0 0
## CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 0 0 0 0 0
## TARGET_AMT_log AGE_log YOJ_log INCOME_log HOME_VAL_log
## 0 0 0 0 0
## TRAVTIME_log BLUEBOOK_log TIF_log OLDCLAIM_log MVR_PTS_log
## 0 0 0 0 0
## CAR_AGE_log TARGET_AMT_sqrt AGE_sqrt YOJ_sqrt INCOME_sqrt
## 0 0 0 0 0
## HOME_VAL_sqrt TRAVTIME_sqrt BLUEBOOK_sqrt TIF_sqrt OLDCLAIM_sqrt
## 0 0 0 0 0
## MVR_PTS_sqrt CAR_AGE_sqrt TARGET_AMT_cbrt AGE_cbrt YOJ_cbrt
## 0 0 0 0 0
## INCOME_cbrt HOME_VAL_cbrt TRAVTIME_cbrt BLUEBOOK_cbrt TIF_cbrt
## 0 0 0 0 0
## OLDCLAIM_cbrt MVR_PTS_cbrt CAR_AGE_cbrt HV_INC_RATIO TRT_MVR_PRODUCT
## 0 0 0 0 0
# Visualize correlation between variables
corrplot(cor(df_ins_imp %>% keep(is.numeric)), method="shade", shade.col=NA, tl.col="black", tl.srt=45)
# Reshape correlation results
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
# Closer look at correlations of variables
corr_results <- rcorr(as.matrix(df_ins_imp %>% keep(is.numeric)))
df_corr <- flattenCorrMatrix(corr_results$r, corr_results$P)
# Noteworthy positive correlations
df_corr %>% filter(cor > 0.4)
## row column cor p
## 1 INCOME HOME_VAL 0.5905964 0
## 2 INCOME BLUEBOOK 0.4347133 0
## 3 INCOME CAR_AGE 0.4259203 0
# Noteworthy negative correlations
df_corr %>% filter(cor < -0.4)
## [1] row column cor p
## <0 rows> (or 0-length row.names)
# Pair plot
pairs(df_ins_imp %>% keep(is.numeric), lower.panel = NULL, col = "steelblue")
model <- glm(TARGET_AMT ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
TIF + OLDCLAIM + MVR_PTS + CAR_AGE, data = df_ins, family = "quasipoisson")
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV 1.024923 4 1.003082
## AGE 1.094995 1 1.046420
## YOJ 1.182086 1 1.087238
## INCOME 1.821517 1 1.349636
## HOME_VAL 1.330625 1 1.153527
## TRAVTIME 1.004598 1 1.002296
## BLUEBOOK 1.229381 1 1.108774
## TIF 1.004250 1 1.002123
## OLDCLAIM 1.080919 1 1.039673
## MVR_PTS 1.097635 1 1.047681
## CAR_AGE 1.252010 1 1.118933
# 0-.25, .25-.75, .75-1
df_ins$CAR_AGE_fact <- cut(x = df_ins$CAR_AGE, breaks = c(-4, 3.5, 12, 28), labels = c("New", "Moderate", "Old"))
# -.5, .5-.9, .9+
df_ins$HOME_VAL_fact <- cut(x = df_ins$HOME_VAL, breaks = c(-86567, 160953, 314151, 885283), labels = c("No or Low", "Moderate", "High"))
# 0-.25, .25-.75, .75-1
df_ins$INCOME_fact <- cut(x = df_ins$INCOME, breaks = c(-31969, 27940, 85472, 367031), labels = c("Low", "Moderate", "High"))
# 0-.5, .50-.75, .75-1
df_ins$MVR_PTS_fact <- cut(x = df_ins$MVR_PTS, breaks = c(-1, 1, 3, 14), labels = c("Low", "Moderate", "High"))
# 0-.75, .75-1
df_ins$OLDCLAIM_fact <- cut(x = df_ins$OLDCLAIM, breaks = c(-1, 4636, 57038), labels = c("Low", "High"))
# 0-.25, .25-.75, .75-1
df_ins$TIF_fact <- cut(x = df_ins$TIF, breaks = c(-1, 1, 7, 26), labels = c("Low", "Moderate", "High"))
# 0-.25, .25-.75, .75-1
df_ins$TRAVTIME_fact <- cut(x = df_ins$TRAVTIME, breaks = c(4, 22, 44, 143), labels = c("Short", "Moderate", "Long"))
# 0-.25, .25-.75, .75-1
df_ins$YOJ_fact <- cut(x = df_ins$YOJ, breaks = c(-1, 9, 13, 24), labels = c("Low", "Moderate", "High"))
# remove set up the data
data_train <- df_ins[-c(2)]
model1 <- glm(TARGET_FLAG ~ 1, data = data_train, family = binomial(link ="logit"))
summary(model1)
##
## Call:
## glm(formula = TARGET_FLAG ~ 1, family = binomial(link = "logit"),
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7827 -0.7827 -0.7827 1.6325 1.6325
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.02623 0.02512 -40.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418 on 8160 degrees of freedom
## Residual deviance: 9418 on 8160 degrees of freedom
## AIC: 9420
##
## Number of Fisher Scoring iterations: 4
model2 <- glm(TARGET_FLAG ~ ., data = data_train, family = binomial(link ="logit"))
summary(model2)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"),
## data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.097e-06 -2.511e-06 -2.057e-06 2.100e-08 5.914e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.345e+03 2.612e+08 0.000 1.000
## KIDSDRIV1 -1.735e-03 1.775e+04 0.000 1.000
## KIDSDRIV2 1.838e-03 2.415e+04 0.000 1.000
## KIDSDRIV3 3.525e-01 4.134e+04 0.000 1.000
## KIDSDRIV4 6.280e-01 2.109e+05 0.000 1.000
## AGE 1.580e+01 1.331e+06 0.000 1.000
## HOMEKIDS1 -2.659e-01 1.695e+04 0.000 1.000
## HOMEKIDS2 -2.706e-01 1.671e+04 0.000 1.000
## HOMEKIDS3 -2.431e-01 1.965e+04 0.000 1.000
## HOMEKIDS4 -2.266e-01 3.240e+04 0.000 1.000
## HOMEKIDS5 3.096e-01 8.536e+04 0.000 1.000
## YOJ -9.961e-01 1.213e+05 0.000 1.000
## INCOME -2.680e-06 5.815e-01 0.000 1.000
## PARENT1Yes 8.250e-02 1.769e+04 0.000 1.000
## HOME_VAL 1.839e-05 1.682e+00 0.000 1.000
## MSTATUSYes 2.509e-01 1.249e+04 0.000 1.000
## SEXM 2.585e-01 1.448e+04 0.000 1.000
## EDUCATIONBachelors -2.021e-01 1.755e+04 0.000 1.000
## EDUCATIONHigh School 1.813e-01 1.383e+04 0.000 1.000
## EDUCATIONMasters -9.897e-02 2.503e+04 0.000 1.000
## EDUCATIONPhD -9.438e-02 2.969e+04 0.000 1.000
## JOBBlue Collar 3.575e-01 2.507e+04 0.000 1.000
## JOBClerical 1.292e-01 2.699e+04 0.000 1.000
## JOBDoctor 8.911e-02 3.249e+04 0.000 1.000
## JOBHome Maker 2.659e-01 3.086e+04 0.000 1.000
## JOBLawyer 1.140e-01 2.315e+04 0.000 1.000
## JOBManager 1.058e-01 2.243e+04 0.000 1.000
## JOBProfessional 1.443e-01 2.435e+04 0.000 1.000
## JOBStudent 2.491e-01 3.278e+04 0.000 1.000
## TRAVTIME 7.137e-01 1.220e+05 0.000 1.000
## CAR_USEPrivate 1.721e-02 1.262e+04 0.000 1.000
## BLUEBOOK -1.516e-04 5.639e+01 0.000 1.000
## TIF 2.338e-01 1.588e+05 0.000 1.000
## CAR_TYPEPanel Truck 7.391e-02 2.282e+04 0.000 1.000
## CAR_TYPEPickup 2.934e-01 1.304e+04 0.000 1.000
## CAR_TYPESports Car -4.077e-02 1.802e+04 0.000 1.000
## CAR_TYPESUV -4.845e-02 1.490e+04 0.000 1.000
## CAR_TYPEVan 7.548e-03 1.730e+04 0.000 1.000
## RED_CARyes -3.638e-01 1.137e+04 0.000 1.000
## OLDCLAIM -8.819e-04 4.414e+01 0.000 1.000
## CLM_FREQ.L -3.199e+01 1.185e+06 0.000 1.000
## CLM_FREQ.Q 2.932e+01 1.083e+06 0.000 1.000
## CLM_FREQ.C -2.007e+01 7.391e+05 0.000 1.000
## CLM_FREQ^4 1.052e+01 3.747e+05 0.000 1.000
## CLM_FREQ^5 -2.941e+00 1.259e+05 0.000 1.000
## REVOKEDYes 4.190e-02 1.429e+04 0.000 1.000
## MVR_PTS 9.022e-02 4.037e+05 0.000 1.000
## CAR_AGE -8.269e-03 2.020e+04 0.000 1.000
## URBANICITYHighly Urban/ Urban 5.374e-02 1.152e+04 0.000 1.000
## TARGET_AMT_log 2.716e+01 3.854e+04 0.001 0.999
## AGE_log -1.586e+03 1.485e+08 0.000 1.000
## YOJ_log -7.349e+00 1.126e+06 0.000 1.000
## INCOME_log -1.404e-01 1.161e+04 0.000 1.000
## HOME_VAL_log -7.806e-01 7.806e+04 0.000 1.000
## TRAVTIME_log -7.413e+01 1.297e+07 0.000 1.000
## BLUEBOOK_log 6.547e+00 1.163e+06 0.000 1.000
## TIF_log -1.365e+02 8.929e+06 0.000 1.000
## OLDCLAIM_log 1.928e+01 7.313e+05 0.000 1.000
## MVR_PTS_log 1.010e+00 2.415e+06 0.000 1.000
## CAR_AGE_log 8.634e-01 1.316e+05 0.000 1.000
## TARGET_AMT_sqrt 2.433e+00 6.768e+03 0.000 1.000
## AGE_sqrt -1.714e+03 1.527e+08 0.000 1.000
## YOJ_sqrt 1.420e+01 1.802e+06 0.000 1.000
## INCOME_sqrt 4.405e-05 2.644e+02 0.000 1.000
## HOME_VAL_sqrt -7.347e-02 6.282e+03 0.000 1.000
## TRAVTIME_sqrt -7.932e+01 1.377e+07 0.000 1.000
## BLUEBOOK_sqrt 3.858e-01 9.554e+04 0.000 1.000
## TIF_sqrt -1.129e+02 5.309e+06 0.000 1.000
## OLDCLAIM_sqrt 1.650e+00 6.927e+04 0.000 1.000
## MVR_PTS_sqrt -1.493e+00 5.256e+06 0.000 1.000
## CAR_AGE_sqrt 1.435e-01 3.475e+05 0.000 1.000
## TARGET_AMT_cbrt -2.048e+01 4.717e+04 0.000 1.000
## AGE_cbrt 5.556e+03 5.039e+08 0.000 1.000
## YOJ_cbrt -7.775e+00 8.697e+05 0.000 1.000
## INCOME_cbrt 4.954e-02 4.126e+03 0.000 1.000
## HOME_VAL_cbrt 6.556e-01 5.847e+04 0.000 1.000
## TRAVTIME_cbrt 2.584e+02 4.501e+07 0.000 1.000
## BLUEBOOK_cbrt -3.421e+00 7.494e+05 0.000 1.000
## TIF_cbrt 4.177e+02 2.342e+07 0.000 1.000
## OLDCLAIM_cbrt -1.305e+01 5.231e+05 0.000 1.000
## MVR_PTS_cbrt 5.135e-01 3.185e+06 0.000 1.000
## CAR_AGE_cbrt -1.028e+00 4.522e+05 0.000 1.000
## HV_INC_RATIO 3.968e-05 7.142e+01 0.000 1.000
## TRT_MVR_PRODUCT -1.001e-04 1.161e+02 0.000 1.000
## CAR_AGE_factModerate -1.707e-01 3.880e+04 0.000 1.000
## CAR_AGE_factOld -2.867e-01 4.299e+04 0.000 1.000
## HOME_VAL_factModerate 4.997e-01 1.898e+04 0.000 1.000
## HOME_VAL_factHigh 7.148e-01 3.213e+04 0.000 1.000
## INCOME_factModerate -2.264e-01 1.950e+04 0.000 1.000
## INCOME_factHigh -3.339e-01 2.860e+04 0.000 1.000
## MVR_PTS_factModerate -9.545e-02 5.133e+04 0.000 1.000
## MVR_PTS_factHigh -9.431e-02 6.485e+04 0.000 1.000
## OLDCLAIM_factHigh 1.318e+00 2.702e+04 0.000 1.000
## TIF_factModerate -7.702e+00 4.320e+05 0.000 1.000
## TIF_factHigh -7.410e+00 4.342e+05 0.000 1.000
## TRAVTIME_factModerate 1.004e-01 2.075e+04 0.000 1.000
## TRAVTIME_factLong -1.945e-02 2.927e+04 0.000 1.000
## YOJ_factModerate -4.566e-01 1.960e+04 0.000 1.000
## YOJ_factHigh -3.441e-01 3.000e+04 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9.4180e+03 on 8160 degrees of freedom
## Residual deviance: 5.1235e-08 on 8062 degrees of freedom
## AIC: 198
##
## Number of Fisher Scoring iterations: 25
plot(model2)
Generate confidence intervals for regression slope
confint.default(model2)
## 2.5 % 97.5 %
## (Intercept) -5.118488e+08 5.118421e+08
## KIDSDRIV1 -3.478957e+04 3.478957e+04
## KIDSDRIV2 -4.733155e+04 4.733156e+04
## KIDSDRIV3 -8.102270e+04 8.102340e+04
## KIDSDRIV4 -4.132803e+05 4.132816e+05
## AGE -2.608372e+06 2.608404e+06
## HOMEKIDS1 -3.321357e+04 3.321303e+04
## HOMEKIDS2 -3.276100e+04 3.276046e+04
## HOMEKIDS3 -3.851237e+04 3.851188e+04
## HOMEKIDS4 -6.349851e+04 6.349805e+04
## HOMEKIDS5 -1.672962e+05 1.672968e+05
## YOJ -2.376768e+05 2.376748e+05
## INCOME -1.139812e+00 1.139806e+00
## PARENT1Yes -3.467866e+04 3.467883e+04
## HOME_VAL -3.296746e+00 3.296783e+00
## MSTATUSYes -2.447699e+04 2.447750e+04
## SEXM -2.838324e+04 2.838375e+04
## EDUCATIONBachelors -3.439030e+04 3.438989e+04
## EDUCATIONHigh School -2.710557e+04 2.710593e+04
## EDUCATIONMasters -4.906690e+04 4.906670e+04
## EDUCATIONPhD -5.819732e+04 5.819714e+04
## JOBBlue Collar -4.914178e+04 4.914249e+04
## JOBClerical -5.290370e+04 5.290395e+04
## JOBDoctor -6.368393e+04 6.368411e+04
## JOBHome Maker -6.049254e+04 6.049307e+04
## JOBLawyer -4.537197e+04 4.537220e+04
## JOBManager -4.396173e+04 4.396195e+04
## JOBProfessional -4.772949e+04 4.772978e+04
## JOBStudent -6.425414e+04 6.425464e+04
## TRAVTIME -2.391640e+05 2.391654e+05
## CAR_USEPrivate -2.473323e+04 2.473327e+04
## BLUEBOOK -1.105181e+02 1.105178e+02
## TIF -3.111941e+05 3.111946e+05
## CAR_TYPEPanel Truck -4.472050e+04 4.472065e+04
## CAR_TYPEPickup -2.554847e+04 2.554906e+04
## CAR_TYPESports Car -3.532592e+04 3.532583e+04
## CAR_TYPESUV -2.919573e+04 2.919563e+04
## CAR_TYPEVan -3.389959e+04 3.389961e+04
## RED_CARyes -2.228563e+04 2.228490e+04
## OLDCLAIM -8.651269e+01 8.651092e+01
## CLM_FREQ.L -2.321779e+06 2.321715e+06
## CLM_FREQ.Q -2.121782e+06 2.121841e+06
## CLM_FREQ.C -1.448711e+06 1.448671e+06
## CLM_FREQ^4 -7.344076e+05 7.344286e+05
## CLM_FREQ^5 -2.468403e+05 2.468344e+05
## REVOKEDYes -2.801319e+04 2.801328e+04
## MVR_PTS -7.912663e+05 7.912665e+05
## CAR_AGE -3.958450e+04 3.958448e+04
## URBANICITYHighly Urban/ Urban -2.258383e+04 2.258394e+04
## TARGET_AMT_log -7.551677e+04 7.557108e+04
## AGE_log -2.910554e+08 2.910522e+08
## YOJ_log -2.206132e+06 2.206117e+06
## INCOME_log -2.275175e+04 2.275147e+04
## HOME_VAL_log -1.529913e+05 1.529897e+05
## TRAVTIME_log -2.541928e+07 2.541913e+07
## BLUEBOOK_log -2.280009e+06 2.280022e+06
## TIF_log -1.749980e+07 1.749952e+07
## OLDCLAIM_log -1.433342e+06 1.433380e+06
## MVR_PTS_log -4.734282e+06 4.734284e+06
## CAR_AGE_log -2.578416e+05 2.578433e+05
## TARGET_AMT_sqrt -1.326338e+04 1.326825e+04
## AGE_sqrt -2.992105e+08 2.992071e+08
## YOJ_sqrt -3.531921e+06 3.531949e+06
## INCOME_sqrt -5.182162e+02 5.182163e+02
## HOME_VAL_sqrt -1.231321e+04 1.231306e+04
## TRAVTIME_sqrt -2.699100e+07 2.699084e+07
## BLUEBOOK_sqrt -1.872507e+05 1.872515e+05
## TIF_sqrt -1.040508e+07 1.040485e+07
## OLDCLAIM_sqrt -1.357719e+05 1.357752e+05
## MVR_PTS_sqrt -1.030107e+07 1.030106e+07
## CAR_AGE_sqrt -6.810229e+05 6.810232e+05
## TARGET_AMT_cbrt -9.246969e+04 9.242873e+04
## AGE_cbrt -9.875794e+08 9.875906e+08
## YOJ_cbrt -1.704594e+06 1.704579e+06
## INCOME_cbrt -8.087103e+03 8.087202e+03
## HOME_VAL_cbrt -1.146009e+05 1.146022e+05
## TRAVTIME_cbrt -8.821839e+07 8.821890e+07
## BLUEBOOK_cbrt -1.468814e+06 1.468807e+06
## TIF_cbrt -4.590341e+07 4.590424e+07
## OLDCLAIM_cbrt -1.025269e+06 1.025243e+06
## MVR_PTS_cbrt -6.242116e+06 6.242117e+06
## CAR_AGE_cbrt -8.863783e+05 8.863763e+05
## HV_INC_RATIO -1.399792e+02 1.399792e+02
## TRT_MVR_PRODUCT -2.275928e+02 2.275926e+02
## CAR_AGE_factModerate -7.604626e+04 7.604592e+04
## CAR_AGE_factOld -8.426315e+04 8.426257e+04
## HOME_VAL_factModerate -3.719063e+04 3.719163e+04
## HOME_VAL_factHigh -6.296905e+04 6.297048e+04
## INCOME_factModerate -3.821705e+04 3.821659e+04
## INCOME_factHigh -5.606057e+04 5.605990e+04
## MVR_PTS_factModerate -1.006057e+05 1.006055e+05
## MVR_PTS_factHigh -1.270942e+05 1.270940e+05
## OLDCLAIM_factHigh -5.294944e+04 5.295208e+04
## TIF_factModerate -8.466392e+05 8.466238e+05
## TIF_factHigh -8.510064e+05 8.509916e+05
## TRAVTIME_factModerate -4.067301e+04 4.067321e+04
## TRAVTIME_factLong -5.736656e+04 5.736652e+04
## YOJ_factModerate -3.841196e+04 3.841105e+04
## YOJ_factHigh -5.879473e+04 5.879405e+04
Generate the odds ratios
exp(coef(model2))
## (Intercept) KIDSDRIV1
## 0.000000e+00 9.982670e-01
## KIDSDRIV2 KIDSDRIV3
## 1.001839e+00 1.422628e+00
## KIDSDRIV4 AGE
## 1.873823e+00 7.310367e+06
## HOMEKIDS1 HOMEKIDS2
## 7.665136e-01 7.629366e-01
## HOMEKIDS3 HOMEKIDS4
## 7.842160e-01 7.972063e-01
## HOMEKIDS5 YOJ
## 1.362944e+00 3.693247e-01
## INCOME PARENT1Yes
## 9.999973e-01 1.085997e+00
## HOME_VAL MSTATUSYes
## 1.000018e+00 1.285141e+00
## SEXM EDUCATIONBachelors
## 1.294976e+00 8.169862e-01
## EDUCATIONHigh School EDUCATIONMasters
## 1.198809e+00 9.057728e-01
## EDUCATIONPhD JOBBlue Collar
## 9.099414e-01 1.429742e+00
## JOBClerical JOBDoctor
## 1.137944e+00 1.093205e+00
## JOBHome Maker JOBLawyer
## 1.304605e+00 1.120785e+00
## JOBManager JOBProfessional
## 1.111590e+00 1.155212e+00
## JOBStudent TRAVTIME
## 1.282826e+00 2.041484e+00
## CAR_USEPrivate BLUEBOOK
## 1.017361e+00 9.998484e-01
## TIF CAR_TYPEPanel Truck
## 1.263384e+00 1.076708e+00
## CAR_TYPEPickup CAR_TYPESports Car
## 1.341014e+00 9.600537e-01
## CAR_TYPESUV CAR_TYPEVan
## 9.527057e-01 1.007577e+00
## RED_CARyes OLDCLAIM
## 6.950107e-01 9.991185e-01
## CLM_FREQ.L CLM_FREQ.Q
## 1.282684e-14 5.428250e+12
## CLM_FREQ.C CLM_FREQ^4
## 1.922168e-09 3.694842e+04
## CLM_FREQ^5 REVOKEDYes
## 5.281858e-02 1.042795e+00
## MVR_PTS CAR_AGE
## 1.094420e+00 9.917648e-01
## URBANICITYHighly Urban/ Urban TARGET_AMT_log
## 1.055209e+00 6.238277e+11
## AGE_log YOJ_log
## 0.000000e+00 6.435035e-04
## INCOME_log HOME_VAL_log
## 8.690197e-01 4.581413e-01
## TRAVTIME_log BLUEBOOK_log
## 6.391237e-33 6.974428e+02
## TIF_log OLDCLAIM_log
## 5.272341e-60 2.369477e+08
## MVR_PTS_log CAR_AGE_log
## 2.746150e+00 2.371163e+00
## TARGET_AMT_sqrt AGE_sqrt
## 1.139636e+01 0.000000e+00
## YOJ_sqrt INCOME_sqrt
## 1.465582e+06 1.000044e+00
## HOME_VAL_sqrt TRAVTIME_sqrt
## 9.291674e-01 3.570723e-35
## BLUEBOOK_sqrt TIF_sqrt
## 1.470762e+00 9.062180e-50
## OLDCLAIM_sqrt MVR_PTS_sqrt
## 5.205367e+00 2.246518e-01
## CAR_AGE_sqrt TARGET_AMT_cbrt
## 1.154281e+00 1.271050e-09
## AGE_cbrt YOJ_cbrt
## Inf 4.202583e-04
## INCOME_cbrt HOME_VAL_cbrt
## 1.050791e+00 1.926318e+00
## TRAVTIME_cbrt BLUEBOOK_cbrt
## 1.623924e+112 3.267791e-02
## TIF_cbrt OLDCLAIM_cbrt
## 2.493477e+181 2.139798e-06
## MVR_PTS_cbrt CAR_AGE_cbrt
## 1.671168e+00 3.575736e-01
## HV_INC_RATIO TRT_MVR_PRODUCT
## 1.000040e+00 9.998999e-01
## CAR_AGE_factModerate CAR_AGE_factOld
## 8.431078e-01 7.507079e-01
## HOME_VAL_factModerate HOME_VAL_factHigh
## 1.648253e+00 2.043822e+00
## INCOME_factModerate INCOME_factHigh
## 7.974285e-01 7.160932e-01
## MVR_PTS_factModerate MVR_PTS_factHigh
## 9.089606e-01 9.100028e-01
## OLDCLAIM_factHigh TIF_factModerate
## 3.736112e+00 4.520045e-04
## TIF_factHigh TRAVTIME_factModerate
## 6.048732e-04 1.105615e+00
## TRAVTIME_factLong YOJ_factModerate
## 9.807391e-01 6.334102e-01
## YOJ_factHigh
## 7.088489e-01
Generate confidence intervals for regression slope
confint.default(model2)
## 2.5 % 97.5 %
## (Intercept) -5.118488e+08 5.118421e+08
## KIDSDRIV1 -3.478957e+04 3.478957e+04
## KIDSDRIV2 -4.733155e+04 4.733156e+04
## KIDSDRIV3 -8.102270e+04 8.102340e+04
## KIDSDRIV4 -4.132803e+05 4.132816e+05
## AGE -2.608372e+06 2.608404e+06
## HOMEKIDS1 -3.321357e+04 3.321303e+04
## HOMEKIDS2 -3.276100e+04 3.276046e+04
## HOMEKIDS3 -3.851237e+04 3.851188e+04
## HOMEKIDS4 -6.349851e+04 6.349805e+04
## HOMEKIDS5 -1.672962e+05 1.672968e+05
## YOJ -2.376768e+05 2.376748e+05
## INCOME -1.139812e+00 1.139806e+00
## PARENT1Yes -3.467866e+04 3.467883e+04
## HOME_VAL -3.296746e+00 3.296783e+00
## MSTATUSYes -2.447699e+04 2.447750e+04
## SEXM -2.838324e+04 2.838375e+04
## EDUCATIONBachelors -3.439030e+04 3.438989e+04
## EDUCATIONHigh School -2.710557e+04 2.710593e+04
## EDUCATIONMasters -4.906690e+04 4.906670e+04
## EDUCATIONPhD -5.819732e+04 5.819714e+04
## JOBBlue Collar -4.914178e+04 4.914249e+04
## JOBClerical -5.290370e+04 5.290395e+04
## JOBDoctor -6.368393e+04 6.368411e+04
## JOBHome Maker -6.049254e+04 6.049307e+04
## JOBLawyer -4.537197e+04 4.537220e+04
## JOBManager -4.396173e+04 4.396195e+04
## JOBProfessional -4.772949e+04 4.772978e+04
## JOBStudent -6.425414e+04 6.425464e+04
## TRAVTIME -2.391640e+05 2.391654e+05
## CAR_USEPrivate -2.473323e+04 2.473327e+04
## BLUEBOOK -1.105181e+02 1.105178e+02
## TIF -3.111941e+05 3.111946e+05
## CAR_TYPEPanel Truck -4.472050e+04 4.472065e+04
## CAR_TYPEPickup -2.554847e+04 2.554906e+04
## CAR_TYPESports Car -3.532592e+04 3.532583e+04
## CAR_TYPESUV -2.919573e+04 2.919563e+04
## CAR_TYPEVan -3.389959e+04 3.389961e+04
## RED_CARyes -2.228563e+04 2.228490e+04
## OLDCLAIM -8.651269e+01 8.651092e+01
## CLM_FREQ.L -2.321779e+06 2.321715e+06
## CLM_FREQ.Q -2.121782e+06 2.121841e+06
## CLM_FREQ.C -1.448711e+06 1.448671e+06
## CLM_FREQ^4 -7.344076e+05 7.344286e+05
## CLM_FREQ^5 -2.468403e+05 2.468344e+05
## REVOKEDYes -2.801319e+04 2.801328e+04
## MVR_PTS -7.912663e+05 7.912665e+05
## CAR_AGE -3.958450e+04 3.958448e+04
## URBANICITYHighly Urban/ Urban -2.258383e+04 2.258394e+04
## TARGET_AMT_log -7.551677e+04 7.557108e+04
## AGE_log -2.910554e+08 2.910522e+08
## YOJ_log -2.206132e+06 2.206117e+06
## INCOME_log -2.275175e+04 2.275147e+04
## HOME_VAL_log -1.529913e+05 1.529897e+05
## TRAVTIME_log -2.541928e+07 2.541913e+07
## BLUEBOOK_log -2.280009e+06 2.280022e+06
## TIF_log -1.749980e+07 1.749952e+07
## OLDCLAIM_log -1.433342e+06 1.433380e+06
## MVR_PTS_log -4.734282e+06 4.734284e+06
## CAR_AGE_log -2.578416e+05 2.578433e+05
## TARGET_AMT_sqrt -1.326338e+04 1.326825e+04
## AGE_sqrt -2.992105e+08 2.992071e+08
## YOJ_sqrt -3.531921e+06 3.531949e+06
## INCOME_sqrt -5.182162e+02 5.182163e+02
## HOME_VAL_sqrt -1.231321e+04 1.231306e+04
## TRAVTIME_sqrt -2.699100e+07 2.699084e+07
## BLUEBOOK_sqrt -1.872507e+05 1.872515e+05
## TIF_sqrt -1.040508e+07 1.040485e+07
## OLDCLAIM_sqrt -1.357719e+05 1.357752e+05
## MVR_PTS_sqrt -1.030107e+07 1.030106e+07
## CAR_AGE_sqrt -6.810229e+05 6.810232e+05
## TARGET_AMT_cbrt -9.246969e+04 9.242873e+04
## AGE_cbrt -9.875794e+08 9.875906e+08
## YOJ_cbrt -1.704594e+06 1.704579e+06
## INCOME_cbrt -8.087103e+03 8.087202e+03
## HOME_VAL_cbrt -1.146009e+05 1.146022e+05
## TRAVTIME_cbrt -8.821839e+07 8.821890e+07
## BLUEBOOK_cbrt -1.468814e+06 1.468807e+06
## TIF_cbrt -4.590341e+07 4.590424e+07
## OLDCLAIM_cbrt -1.025269e+06 1.025243e+06
## MVR_PTS_cbrt -6.242116e+06 6.242117e+06
## CAR_AGE_cbrt -8.863783e+05 8.863763e+05
## HV_INC_RATIO -1.399792e+02 1.399792e+02
## TRT_MVR_PRODUCT -2.275928e+02 2.275926e+02
## CAR_AGE_factModerate -7.604626e+04 7.604592e+04
## CAR_AGE_factOld -8.426315e+04 8.426257e+04
## HOME_VAL_factModerate -3.719063e+04 3.719163e+04
## HOME_VAL_factHigh -6.296905e+04 6.297048e+04
## INCOME_factModerate -3.821705e+04 3.821659e+04
## INCOME_factHigh -5.606057e+04 5.605990e+04
## MVR_PTS_factModerate -1.006057e+05 1.006055e+05
## MVR_PTS_factHigh -1.270942e+05 1.270940e+05
## OLDCLAIM_factHigh -5.294944e+04 5.295208e+04
## TIF_factModerate -8.466392e+05 8.466238e+05
## TIF_factHigh -8.510064e+05 8.509916e+05
## TRAVTIME_factModerate -4.067301e+04 4.067321e+04
## TRAVTIME_factLong -5.736656e+04 5.736652e+04
## YOJ_factModerate -3.841196e+04 3.841105e+04
## YOJ_factHigh -5.879473e+04 5.879405e+04
Generate the odds ratios
exp(coef(model2))
## (Intercept) KIDSDRIV1
## 0.000000e+00 9.982670e-01
## KIDSDRIV2 KIDSDRIV3
## 1.001839e+00 1.422628e+00
## KIDSDRIV4 AGE
## 1.873823e+00 7.310367e+06
## HOMEKIDS1 HOMEKIDS2
## 7.665136e-01 7.629366e-01
## HOMEKIDS3 HOMEKIDS4
## 7.842160e-01 7.972063e-01
## HOMEKIDS5 YOJ
## 1.362944e+00 3.693247e-01
## INCOME PARENT1Yes
## 9.999973e-01 1.085997e+00
## HOME_VAL MSTATUSYes
## 1.000018e+00 1.285141e+00
## SEXM EDUCATIONBachelors
## 1.294976e+00 8.169862e-01
## EDUCATIONHigh School EDUCATIONMasters
## 1.198809e+00 9.057728e-01
## EDUCATIONPhD JOBBlue Collar
## 9.099414e-01 1.429742e+00
## JOBClerical JOBDoctor
## 1.137944e+00 1.093205e+00
## JOBHome Maker JOBLawyer
## 1.304605e+00 1.120785e+00
## JOBManager JOBProfessional
## 1.111590e+00 1.155212e+00
## JOBStudent TRAVTIME
## 1.282826e+00 2.041484e+00
## CAR_USEPrivate BLUEBOOK
## 1.017361e+00 9.998484e-01
## TIF CAR_TYPEPanel Truck
## 1.263384e+00 1.076708e+00
## CAR_TYPEPickup CAR_TYPESports Car
## 1.341014e+00 9.600537e-01
## CAR_TYPESUV CAR_TYPEVan
## 9.527057e-01 1.007577e+00
## RED_CARyes OLDCLAIM
## 6.950107e-01 9.991185e-01
## CLM_FREQ.L CLM_FREQ.Q
## 1.282684e-14 5.428250e+12
## CLM_FREQ.C CLM_FREQ^4
## 1.922168e-09 3.694842e+04
## CLM_FREQ^5 REVOKEDYes
## 5.281858e-02 1.042795e+00
## MVR_PTS CAR_AGE
## 1.094420e+00 9.917648e-01
## URBANICITYHighly Urban/ Urban TARGET_AMT_log
## 1.055209e+00 6.238277e+11
## AGE_log YOJ_log
## 0.000000e+00 6.435035e-04
## INCOME_log HOME_VAL_log
## 8.690197e-01 4.581413e-01
## TRAVTIME_log BLUEBOOK_log
## 6.391237e-33 6.974428e+02
## TIF_log OLDCLAIM_log
## 5.272341e-60 2.369477e+08
## MVR_PTS_log CAR_AGE_log
## 2.746150e+00 2.371163e+00
## TARGET_AMT_sqrt AGE_sqrt
## 1.139636e+01 0.000000e+00
## YOJ_sqrt INCOME_sqrt
## 1.465582e+06 1.000044e+00
## HOME_VAL_sqrt TRAVTIME_sqrt
## 9.291674e-01 3.570723e-35
## BLUEBOOK_sqrt TIF_sqrt
## 1.470762e+00 9.062180e-50
## OLDCLAIM_sqrt MVR_PTS_sqrt
## 5.205367e+00 2.246518e-01
## CAR_AGE_sqrt TARGET_AMT_cbrt
## 1.154281e+00 1.271050e-09
## AGE_cbrt YOJ_cbrt
## Inf 4.202583e-04
## INCOME_cbrt HOME_VAL_cbrt
## 1.050791e+00 1.926318e+00
## TRAVTIME_cbrt BLUEBOOK_cbrt
## 1.623924e+112 3.267791e-02
## TIF_cbrt OLDCLAIM_cbrt
## 2.493477e+181 2.139798e-06
## MVR_PTS_cbrt CAR_AGE_cbrt
## 1.671168e+00 3.575736e-01
## HV_INC_RATIO TRT_MVR_PRODUCT
## 1.000040e+00 9.998999e-01
## CAR_AGE_factModerate CAR_AGE_factOld
## 8.431078e-01 7.507079e-01
## HOME_VAL_factModerate HOME_VAL_factHigh
## 1.648253e+00 2.043822e+00
## INCOME_factModerate INCOME_factHigh
## 7.974285e-01 7.160932e-01
## MVR_PTS_factModerate MVR_PTS_factHigh
## 9.089606e-01 9.100028e-01
## OLDCLAIM_factHigh TIF_factModerate
## 3.736112e+00 4.520045e-04
## TIF_factHigh TRAVTIME_factModerate
## 6.048732e-04 1.105615e+00
## TRAVTIME_factLong YOJ_factModerate
## 9.807391e-01 6.334102e-01
## YOJ_factHigh
## 7.088489e-01
I will select the variables using the stepwise method
The ‘stepAIC’ function in R performs a stepwise model selection with an objective to minimize the AIC value.
Using StepWise in both direction
stepWise_b <- stepAIC(model1, scope = list(upper=model2),direction="both")
## Start: AIC=9419.96
## TARGET_FLAG ~ 1
##
## Df Deviance AIC
## + TARGET_AMT_log 1 0.0 4.0
## + TARGET_AMT_cbrt 1 0.0 4.0
## + TARGET_AMT_sqrt 1 0.0 4.0
## + URBANICITY 1 8915.9 8919.9
## + CLM_FREQ 5 8948.2 8960.2
## + OLDCLAIM_log 1 8959.2 8963.2
## + OLDCLAIM_cbrt 1 9035.1 9039.1
## + MVR_PTS 1 9049.6 9053.6
## + TRT_MVR_PRODUCT 1 9083.1 9087.1
## + MVR_PTS_sqrt 1 9092.5 9096.5
## + MVR_PTS_log 1 9093.3 9097.3
## + OLDCLAIM_sqrt 1 9111.6 9115.6
## + MVR_PTS_cbrt 1 9128.4 9132.4
## + HOME_VAL 1 9130.2 9134.2
## + MVR_PTS_fact 2 9133.3 9139.3
## + HOME_VAL_sqrt 1 9158.9 9162.9
## + JOB 8 9145.1 9163.1
## + HOME_VAL_cbrt 1 9184.1 9188.1
## + HOME_VAL_fact 2 9188.5 9194.5
## + OLDCLAIM_fact 1 9198.4 9202.4
## + PARENT1 1 9232.4 9236.4
## + HOME_VAL_log 1 9235.0 9239.0
## + INCOME_fact 2 9236.9 9242.9
## + INCOME 1 9240.0 9244.0
## + REVOKED 1 9245.9 9249.9
## + CAR_TYPE 5 9237.9 9249.9
## + INCOME_sqrt 1 9248.5 9252.5
## + EDUCATION 4 9246.5 9256.5
## + CAR_USE 1 9255.0 9259.0
## + MSTATUS 1 9270.8 9274.8
## + OLDCLAIM 1 9277.5 9281.5
## + HOMEKIDS 5 9283.6 9295.6
## + AGE_log 1 9306.7 9310.7
## + AGE_cbrt 1 9313.3 9317.3
## + BLUEBOOK_cbrt 1 9314.8 9318.8
## + BLUEBOOK_sqrt 1 9316.2 9320.2
## + AGE_sqrt 1 9317.1 9321.1
## + BLUEBOOK_log 1 9317.7 9321.7
## + CAR_AGE 1 9325.0 9329.0
## + BLUEBOOK 1 9327.5 9331.5
## + CAR_AGE_fact 2 9326.3 9332.3
## + AGE 1 9328.8 9332.8
## + CAR_AGE_sqrt 1 9330.3 9334.3
## + CAR_AGE_cbrt 1 9334.0 9338.0
## + CAR_AGE_log 1 9335.3 9339.3
## + KIDSDRIV 4 9331.9 9341.9
## + TIF_sqrt 1 9359.1 9363.1
## + TIF_cbrt 1 9359.7 9363.7
## + TIF_log 1 9360.3 9364.3
## + TIF 1 9360.8 9364.8
## + YOJ_cbrt 1 9364.1 9368.1
## + YOJ_log 1 9365.6 9369.6
## + YOJ_sqrt 1 9366.5 9370.5
## + TIF_fact 2 9368.2 9374.2
## + YOJ 1 9375.7 9379.7
## + TRAVTIME_log 1 9390.2 9394.2
## + TRAVTIME_cbrt 1 9392.1 9396.1
## + TRAVTIME_sqrt 1 9393.6 9397.6
## + TRAVTIME_fact 2 9392.8 9398.8
## + YOJ_fact 2 9394.0 9400.0
## + TRAVTIME 1 9399.0 9403.0
## + INCOME_cbrt 1 9412.0 9416.0
## + SEX 1 9414.3 9418.3
## <none> 9418.0 9420.0
## + INCOME_log 1 9416.5 9420.5
## + HV_INC_RATIO 1 9417.5 9421.5
## + RED_CAR 1 9417.6 9421.6
##
## Step: AIC=4
## TARGET_FLAG ~ TARGET_AMT_log
##
## Df Deviance AIC
## <none> 0 4
## + TARGET_AMT_cbrt 1 0 6
## + TARGET_AMT_sqrt 1 0 6
## + MVR_PTS 1 0 6
## + TRT_MVR_PRODUCT 1 0 6
## + MVR_PTS_log 1 0 6
## + MVR_PTS_sqrt 1 0 6
## + MVR_PTS_cbrt 1 0 6
## + PARENT1 1 0 6
## + REVOKED 1 0 6
## + BLUEBOOK 1 0 6
## + BLUEBOOK_sqrt 1 0 6
## + BLUEBOOK_cbrt 1 0 6
## + BLUEBOOK_log 1 0 6
## + MSTATUS 1 0 6
## + CAR_AGE 1 0 6
## + OLDCLAIM_log 1 0 6
## + CAR_AGE_sqrt 1 0 6
## + CAR_AGE_cbrt 1 0 6
## + CAR_AGE_log 1 0 6
## + INCOME 1 0 6
## + OLDCLAIM 1 0 6
## + INCOME_sqrt 1 0 6
## + OLDCLAIM_cbrt 1 0 6
## + OLDCLAIM_sqrt 1 0 6
## + SEX 1 0 6
## + RED_CAR 1 0 6
## + TIF_log 1 0 6
## + TIF_cbrt 1 0 6
## + YOJ_cbrt 1 0 6
## + YOJ_log 1 0 6
## + HOME_VAL_cbrt 1 0 6
## + TRAVTIME_sqrt 1 0 6
## + TRAVTIME 1 0 6
## + TRAVTIME_cbrt 1 0 6
## + INCOME_cbrt 1 0 6
## + HV_INC_RATIO 1 0 6
## + INCOME_log 1 0 6
## + TIF_sqrt 1 0 6
## + TRAVTIME_log 1 0 6
## + YOJ_sqrt 1 0 6
## + URBANICITY 1 0 6
## + TIF 1 0 6
## + HOME_VAL_sqrt 1 0 6
## + HOME_VAL 1 0 6
## + CAR_USE 1 0 6
## + OLDCLAIM_fact 1 0 6
## + AGE 1 0 6
## + YOJ 1 0 6
## + AGE_sqrt 1 0 6
## + AGE_cbrt 1 0 6
## + HOME_VAL_log 1 0 6
## + AGE_log 1 0 6
## + MVR_PTS_fact 2 0 8
## + TIF_fact 2 0 8
## + INCOME_fact 2 0 8
## + HOME_VAL_fact 2 0 8
## + CAR_AGE_fact 2 0 8
## + TRAVTIME_fact 2 0 8
## + YOJ_fact 2 0 8
## + EDUCATION 4 0 12
## + KIDSDRIV 4 0 12
## + CLM_FREQ 5 0 14
## + CAR_TYPE 5 0 14
## + HOMEKIDS 5 0 14
## + JOB 8 0 20
## - TARGET_AMT_log 1 9418 9420
Create histogram of the Probability
hist(stepWise_b$fitted.values, main = " Histogram ",xlab = "Probability", col = 'skyblue3')
Show the predicted values
df_ins$Predict <- ifelse(stepWise_b$fitted.values >0.5,"pos","neg")
head(df_ins$Predict)
## [1] "neg" "neg" "neg" "neg" "neg" "pos"