library(tidyverse)
library(ggplot2)
library(mice)
library(car)
library(Hmisc)
library(corrplot)
library(pscl)
library(boot)
library(nlme)
# Load insurance csv
df_ins_raw <- read.csv("insurance_training_data.csv")
# Removing index as instructed
df_ins_raw <- subset(df_ins_raw, select = -c(INDEX))
# Preview data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1…
## $ TARGET_AMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 4021.0…
## $ KIDSDRIV <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AGE <int> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53, 45…
## $ HOMEKIDS <int> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 2, 1…
## $ YOJ <int> 11, 11, 10, 14, NA, 12, NA, NA, 10, 7, 14, 5, 11, 11, 0, 1…
## $ INCOME <chr> "$67,349", "$91,449", "$16,039", "", "$114,986", "$125,301…
## $ PARENT1 <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL <chr> "$0", "$257,252", "$124,191", "$306,251", "$243,925", "$0"…
## $ MSTATUS <chr> "z_No", "z_No", "Yes", "Yes", "Yes", "z_No", "Yes", "Yes",…
## $ SEX <chr> "M", "M", "z_F", "M", "z_F", "z_F", "z_F", "M", "z_F", "M"…
## $ EDUCATION <chr> "PhD", "z_High School", "z_High School", "<High School", "…
## $ JOB <chr> "Professional", "z_Blue Collar", "Clerical", "z_Blue Colla…
## $ TRAVTIME <int> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64, 48,…
## $ CAR_USE <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK <chr> "$14,230", "$14,940", "$4,010", "$15,440", "$18,000", "$17…
## $ TIF <int> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7, 4, …
## $ CAR_TYPE <chr> "Minivan", "Minivan", "z_SUV", "Minivan", "z_SUV", "Sports…
## $ RED_CAR <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM <chr> "$4,461", "$0", "$38,690", "$0", "$19,217", "$0", "$0", "$…
## $ CLM_FREQ <int> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2…
## $ REVOKED <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS <int> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0, 0, …
## $ CAR_AGE <int> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13, 16,…
## $ URBANICITY <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…
remove_z <- function(x){
str_replace(x, 'z_', '')
}
# Remove extraneous z_
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_z))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
remove_dollar <- function(x){
str_replace(x, '\\$', '')
}
# Remove dollar sign from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_dollar))
remove_comma <- function(x){
str_replace(x, ',', '')
}
# Remove commas from variables
df_ins_raw <- mutate_all(df_ins_raw, funs(remove_comma))
# Preview updated data
glimpse(df_ins_raw)
## Rows: 8,161
## Columns: 25
## $ TARGET_FLAG <chr> "0", "0", "0", "0", "0", "1", "0", "1", "1", "0", "1", "0"…
## $ TARGET_AMT <chr> "0", "0", "0", "0", "0", "2946", "0", "4021", "2501", "0",…
## $ KIDSDRIV <chr> "0", "0", "0", "0", "0", "0", "0", "1", "0", "0", "0", "0"…
## $ AGE <chr> "60", "43", "35", "51", "50", "34", "54", "37", "34", "50"…
## $ HOMEKIDS <chr> "0", "0", "1", "0", "0", "1", "0", "2", "0", "0", "0", "0"…
## $ YOJ <chr> "11", "11", "10", "14", NA, "12", NA, NA, "10", "7", "14",…
## $ INCOME <chr> "67349", "91449", "16039", "", "114986", "125301", "18755"…
## $ PARENT1 <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No…
## $ HOME_VAL <chr> "0", "257252", "124191", "306251", "243925", "0", "", "333…
## $ MSTATUS <chr> "No", "No", "Yes", "Yes", "Yes", "No", "Yes", "Yes", "No",…
## $ SEX <chr> "M", "M", "F", "M", "F", "F", "F", "M", "F", "M", "F", "F"…
## $ EDUCATION <chr> "PhD", "High School", "High School", "<High School", "PhD"…
## $ JOB <chr> "Professional", "Blue Collar", "Clerical", "Blue Collar", …
## $ TRAVTIME <chr> "14", "22", "5", "32", "36", "46", "33", "44", "34", "48",…
## $ CAR_USE <chr> "Private", "Commercial", "Private", "Private", "Private", …
## $ BLUEBOOK <chr> "14230", "14940", "4010", "15440", "18000", "17430", "8780…
## $ TIF <chr> "11", "1", "4", "7", "1", "1", "1", "1", "1", "7", "1", "7…
## $ CAR_TYPE <chr> "Minivan", "Minivan", "SUV", "Minivan", "SUV", "Sports Car…
## $ RED_CAR <chr> "yes", "yes", "no", "yes", "no", "no", "no", "yes", "no", …
## $ OLDCLAIM <chr> "4461", "0", "38690", "0", "19217", "0", "0", "2374", "0",…
## $ CLM_FREQ <chr> "2", "0", "2", "0", "2", "0", "0", "1", "0", "0", "0", "0"…
## $ REVOKED <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No", "N…
## $ MVR_PTS <chr> "3", "0", "3", "0", "3", "0", "0", "10", "0", "1", "0", "0…
## $ CAR_AGE <chr> "18", "1", "10", "6", "17", "7", "1", "7", "1", "17", "11"…
## $ URBANICITY <chr> "Highly Urban/ Urban", "Highly Urban/ Urban", "Highly Urba…
# Count of distinct values for each column
df_ins_raw %>% summarise_all(n_distinct)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME PARENT1 HOME_VAL
## 1 2 1949 5 61 6 22 6613 2 5107
## MSTATUS SEX EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE RED_CAR
## 1 2 2 5 9 97 2 2789 23 6 2
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 1 2857 6 2 13 31 2
df_ins_raw %>% distinct(PARENT1)
## PARENT1
## 1 No
## 2 Yes
df_ins_raw %>% distinct(MSTATUS)
## MSTATUS
## 1 No
## 2 Yes
df_ins_raw %>% distinct(SEX)
## SEX
## 1 M
## 2 F
df_ins_raw %>% distinct(EDUCATION)
## EDUCATION
## 1 PhD
## 2 High School
## 3 <High School
## 4 Bachelors
## 5 Masters
df_ins_raw %>% distinct(JOB)
## JOB
## 1 Professional
## 2 Blue Collar
## 3 Clerical
## 4 Doctor
## 5 Lawyer
## 6 Manager
## 7
## 8 Home Maker
## 9 Student
df_ins_raw %>% distinct(CAR_USE)
## CAR_USE
## 1 Private
## 2 Commercial
df_ins_raw %>% distinct(CAR_TYPE)
## CAR_TYPE
## 1 Minivan
## 2 SUV
## 3 Sports Car
## 4 Van
## 5 Panel Truck
## 6 Pickup
df_ins_raw %>% distinct(CLM_FREQ)
## CLM_FREQ
## 1 2
## 2 0
## 3 1
## 4 3
## 5 5
## 6 4
df_ins_raw %>% distinct(REVOKED)
## REVOKED
## 1 No
## 2 Yes
df_ins_raw %>% distinct(URBANICITY)
## URBANICITY
## 1 Highly Urban/ Urban
## 2 Highly Rural/ Rural
# Set data types for variables
df_ins_clean <- df_ins_raw %>% transform(
TARGET_FLAG = as.factor(TARGET_FLAG),
TARGET_AMT = as.numeric(TARGET_AMT),
KIDSDRIV = as.factor(KIDSDRIV),
AGE = as.numeric(AGE),
HOMEKIDS = as.factor(HOMEKIDS),
YOJ = as.numeric(YOJ),
INCOME = as.numeric(INCOME),
PARENT1 = as.factor(PARENT1),
HOME_VAL = as.numeric(HOME_VAL),
MSTATUS = as.factor(MSTATUS),
SEX = as.factor(SEX),
EDUCATION = as.factor(EDUCATION),
JOB = as.factor(JOB),
TRAVTIME = as.numeric(TRAVTIME),
CAR_USE = as.factor(CAR_USE),
BLUEBOOK = as.numeric(BLUEBOOK),
TIF = as.numeric(TIF), # factor or numeric?
CAR_TYPE = as.factor(CAR_TYPE),
RED_CAR = as.factor(RED_CAR),
OLDCLAIM = as.numeric(OLDCLAIM),
CLM_FREQ = as.ordered(CLM_FREQ), # factor or numeric?
REVOKED = as.factor(REVOKED),
MVR_PTS = as.numeric(MVR_PTS),
CAR_AGE = as.numeric(CAR_AGE),
URBANICITY = as.factor(URBANICITY))
# Confirm CLM_FREQ is an ordered factor
is.ordered(df_ins_clean$CLM_FREQ)
## [1] TRUE
# NA counts for each column
colSums(is.na(df_ins_clean))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## 0 0 0 6 0 454
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## 445 0 464 0 0 0
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 0 0 0 0 0 0
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 0 0 0 0 0 510
## URBANICITY
## 0
# Visualize NA counts for each column
df_ins_clean %>%
summarise_all(list(~is.na(.)))%>%
pivot_longer(everything(),
names_to = "variables", values_to="missing") %>%
count(variables, missing) %>%
ggplot(aes(y=variables,x=n,fill=missing))+
geom_col()
# Impute data by regression:
df_ins_imp <- mice(df_ins_clean, method = "norm.predict", m = 1, remove.collinear=FALSE)
##
## iter imp variable
## 1 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 2 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 3 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 4 1 AGE YOJ INCOME HOME_VAL CAR_AGE
## 5 1 AGE YOJ INCOME HOME_VAL CAR_AGE
df_ins_imp <- complete(df_ins_imp)
# Confirm no NAs remain
colSums(is.na(df_ins_imp))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## 0 0 0 0 0 0
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## 0 0 0 0 0 0
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 0 0 0 0 0 0
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## 0 0 0 0 0 0
## URBANICITY
## 0
describe(df_ins_imp)
## df_ins_imp
##
## 25 Variables 8161 Observations
## --------------------------------------------------------------------------------
## TARGET_FLAG
## n missing distinct
## 8161 0 2
##
## Value 0 1
## Frequency 6008 2153
## Proportion 0.736 0.264
## --------------------------------------------------------------------------------
## TARGET_AMT
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 1949 0.601 1504 2574 0 0
## .25 .50 .75 .90 .95
## 0 0 1036 4904 6452
##
## lowest : 0.00000 30.27728 58.53106 95.56732 108.74150
## highest: 73783.46592 77907.43028 78874.19056 85523.65335 107586.13616
## --------------------------------------------------------------------------------
## KIDSDRIV
## n missing distinct
## 8161 0 5
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 7180 636 279 62 4
## Proportion 0.880 0.078 0.034 0.008 0.000
## --------------------------------------------------------------------------------
## AGE
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 66 0.999 44.78 9.749 30 34
## .25 .50 .75 .90 .95
## 39 45 51 56 59
##
## lowest : 16 17 18 19 20, highest: 72 73 76 80 81
## --------------------------------------------------------------------------------
## HOMEKIDS
## n missing distinct
## 8161 0 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 5289 902 1118 674 164 14
## Proportion 0.648 0.111 0.137 0.083 0.020 0.002
## --------------------------------------------------------------------------------
## YOJ
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 475 0.991 10.5 4.2 0 5
## .25 .50 .75 .90 .95
## 9 11 13 14 15
##
## lowest : 0.000000 1.000000 2.000000 2.748152 3.000000
## highest: 16.348241 17.000000 18.000000 19.000000 23.000000
## --------------------------------------------------------------------------------
## INCOME
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 7057 1 61594 51009 0 4362
## .25 .50 .75 .90 .95
## 27940 54216 85472 122744 151663
##
## lowest : -31968.53 -26991.28 -20478.35 -16829.41 -16713.99
## highest: 306277.00 309628.00 320127.00 332339.00 367030.00
## --------------------------------------------------------------------------------
## PARENT1
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 7084 1077
## Proportion 0.868 0.132
## --------------------------------------------------------------------------------
## HOME_VAL
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 5570 0.978 154934 142397 0 0
## .25 .50 .75 .90 .95
## 0 160953 237604 314151 373031
##
## lowest : -86554.85 -71570.86 -71165.20 -70648.14 -68150.78
## highest: 657804.00 682634.00 738153.00 750455.00 885282.00
## --------------------------------------------------------------------------------
## MSTATUS
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 3267 4894
## Proportion 0.4 0.6
## --------------------------------------------------------------------------------
## SEX
## n missing distinct
## 8161 0 2
##
## Value F M
## Frequency 4375 3786
## Proportion 0.536 0.464
## --------------------------------------------------------------------------------
## EDUCATION
## n missing distinct
## 8161 0 5
##
## lowest : <High School Bachelors High School Masters PhD
## highest: <High School Bachelors High School Masters PhD
##
## Value <High School Bachelors High School Masters PhD
## Frequency 1203 2242 2330 1658 728
## Proportion 0.147 0.275 0.286 0.203 0.089
## --------------------------------------------------------------------------------
## JOB
## n missing distinct
## 8161 0 9
##
## lowest : Blue Collar Clerical Doctor Home Maker
## highest: Home Maker Lawyer Manager Professional Student
##
## Value Blue Collar Clerical Doctor Home Maker
## Frequency 526 1825 1271 246 641
## Proportion 0.064 0.224 0.156 0.030 0.079
##
## Value Lawyer Manager Professional Student
## Frequency 835 988 1117 712
## Proportion 0.102 0.121 0.137 0.087
## --------------------------------------------------------------------------------
## TRAVTIME
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 97 1 33.49 17.85 7 13
## .25 .50 .75 .90 .95
## 22 33 44 54 60
##
## lowest : 5 6 7 8 9, highest: 103 113 124 134 142
## --------------------------------------------------------------------------------
## CAR_USE
## n missing distinct
## 8161 0 2
##
## Value Commercial Private
## Frequency 3029 5132
## Proportion 0.371 0.629
## --------------------------------------------------------------------------------
## BLUEBOOK
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 2789 1 15710 9354 4900 6000
## .25 .50 .75 .90 .95
## 9280 14440 20850 27460 31110
##
## lowest : 1500 1520 1530 1540 1590, highest: 57970 61050 62240 65970 69740
## --------------------------------------------------------------------------------
## TIF
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 23 0.961 5.351 4.512 1 1
## .25 .50 .75 .90 .95
## 1 4 7 11 13
##
## lowest : 1 2 3 4 5, highest: 19 20 21 22 25
## --------------------------------------------------------------------------------
## CAR_TYPE
## n missing distinct
## 8161 0 6
##
## lowest : Minivan Panel Truck Pickup Sports Car SUV
## highest: Panel Truck Pickup Sports Car SUV Van
##
## Value Minivan Panel Truck Pickup Sports Car SUV
## Frequency 2145 676 1389 907 2294
## Proportion 0.263 0.083 0.170 0.111 0.281
##
## Value Van
## Frequency 750
## Proportion 0.092
## --------------------------------------------------------------------------------
## RED_CAR
## n missing distinct
## 8161 0 2
##
## Value no yes
## Frequency 5783 2378
## Proportion 0.709 0.291
## --------------------------------------------------------------------------------
## OLDCLAIM
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 2857 0.769 4037 6563 0 0
## .25 .50 .75 .90 .95
## 0 0 4636 9583 27090
##
## lowest : 0 502 506 518 519, highest: 52507 53477 53568 53986 57037
## --------------------------------------------------------------------------------
## CLM_FREQ
## n missing distinct
## 8161 0 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 5009 997 1171 776 190 18
## Proportion 0.614 0.122 0.143 0.095 0.023 0.002
## --------------------------------------------------------------------------------
## REVOKED
## n missing distinct
## 8161 0 2
##
## Value No Yes
## Frequency 7161 1000
## Proportion 0.877 0.123
## --------------------------------------------------------------------------------
## MVR_PTS
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 13 0.9 1.696 2.187 0 0
## .25 .50 .75 .90 .95
## 0 1 3 5 6
##
## lowest : 0 1 2 3 4, highest: 8 9 10 11 13
##
## Value 0 1 2 3 4 5 6 7 8 9 10
## Frequency 3712 1157 948 758 599 399 266 167 84 45 13
## Proportion 0.455 0.142 0.116 0.093 0.073 0.049 0.033 0.020 0.010 0.006 0.002
##
## Value 11 13
## Frequency 11 2
## Proportion 0.001 0.000
## --------------------------------------------------------------------------------
## CAR_AGE
## n missing distinct Info Mean Gmd .05 .10
## 8161 0 540 0.985 8.347 6.374 1.000 1.000
## .25 .50 .75 .90 .95
## 3.514 8.000 12.000 16.000 18.000
##
## lowest : -3.0000000 -1.1184514 -0.9423262 0.0000000 1.0000000
## highest: 24.0000000 25.0000000 26.0000000 27.0000000 28.0000000
## --------------------------------------------------------------------------------
## URBANICITY
## n missing distinct
## 8161 0 2
##
## Value Highly Rural/ Rural Highly Urban/ Urban
## Frequency 1669 6492
## Proportion 0.205 0.795
## --------------------------------------------------------------------------------
# Histograms
df_ins_imp %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density(fill = "steelblue", alpha=0.9, color="steelblue") +
geom_histogram(aes(y=..density..), alpha=0.5, fill = "lightblue", color="lightblue", position="identity")
# Boxplots
df_ins_imp %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_boxplot(fill = "steelblue", color="black", outlier.colour="red", outlier.shape=16,
outlier.size=2, notch=FALSE)
# Log transformation
df_ins_imp_log <- df_ins_imp %>% keep(is.numeric)
df_ins_imp_log <- log(df_ins_imp_log + 1)
# Histograms of log transformed numeric variables
df_ins_imp_log %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_log$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_log$AGE[0:5000]
## W = 0.97949, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_log$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_log$AGE, col = "steelblue", lwd = 2)
# Square root transformation
df_ins_imp_sqrt <- sqrt(df_ins_imp %>% keep(is.numeric))
# Histograms of square root transformed numeric variables
df_ins_imp_sqrt %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_sqrt$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_sqrt$AGE[0:5000]
## W = 0.99371, p-value = 5.047e-14
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_sqrt$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_sqrt$AGE, col = "steelblue", lwd = 2)
# Cube root transformation
df_ins_imp_cube <- (df_ins_imp %>% keep(is.numeric))^(1/3)
# Histograms of cube root transformed numeric variables
df_ins_imp_cube %>%
gather(variable, value, TARGET_AMT:CAR_AGE) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
# Test for normality
shapiro.test(df_ins_imp_cube$AGE[0:5000])
##
## Shapiro-Wilk normality test
##
## data: df_ins_imp_cube$AGE[0:5000]
## W = 0.98989, p-value < 2.2e-16
# Visual inspection of one variable (age) for normality
qqnorm(df_ins_imp_cube$AGE, pch = 1, frame = FALSE)
qqline(df_ins_imp_cube$AGE, col = "steelblue", lwd = 2)
df_ins <- df_ins_imp %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(log = ~ log(. + 1))))
df_ins <- df_ins %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(sqrt = sqrt)))
df_ins <- df_ins %>%
mutate(across(c(TARGET_AMT, AGE, YOJ, INCOME, HOME_VAL, TRAVTIME, BLUEBOOK,
TIF, OLDCLAIM, MVR_PTS, CAR_AGE), .fns = list(cbrt = ~ .^(1/3))))
glimpse(df_ins)
## Rows: 8,161
## Columns: 58
## $ TARGET_FLAG <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, …
## $ TARGET_AMT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 2946.000, 0.000, 40…
## $ KIDSDRIV <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ AGE <dbl> 60, 43, 35, 51, 50, 34, 54, 37, 34, 50, 53, 43, 55, 53…
## $ HOMEKIDS <fct> 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, …
## $ YOJ <dbl> 11.00000, 11.00000, 10.00000, 14.00000, 11.34200, 12.0…
## $ INCOME <dbl> 67349.00, 91449.00, 16039.00, 67185.86, 114986.00, 125…
## $ PARENT1 <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No, N…
## $ HOME_VAL <dbl> 0.0, 257252.0, 124191.0, 306251.0, 243925.0, 0.0, 1654…
## $ MSTATUS <fct> No, No, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, …
## $ SEX <fct> M, M, F, M, F, F, F, M, F, M, F, F, M, M, F, F, M, F, …
## $ EDUCATION <fct> PhD, High School, High School, <High School, PhD, Bach…
## $ JOB <fct> Professional, Blue Collar, Clerical, Blue Collar, Doct…
## $ TRAVTIME <dbl> 14, 22, 5, 32, 36, 46, 33, 44, 34, 48, 15, 36, 25, 64,…
## $ CAR_USE <fct> Private, Commercial, Private, Private, Private, Commer…
## $ BLUEBOOK <dbl> 14230, 14940, 4010, 15440, 18000, 17430, 8780, 16970, …
## $ TIF <dbl> 11, 1, 4, 7, 1, 1, 1, 1, 1, 7, 1, 7, 7, 6, 1, 6, 6, 7,…
## $ CAR_TYPE <fct> Minivan, Minivan, SUV, Minivan, SUV, Sports Car, SUV, …
## $ RED_CAR <fct> yes, yes, no, yes, no, no, no, yes, no, no, no, no, ye…
## $ OLDCLAIM <dbl> 4461, 0, 38690, 0, 19217, 0, 0, 2374, 0, 0, 0, 0, 5028…
## $ CLM_FREQ <ord> 2, 0, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, …
## $ REVOKED <fct> No, No, No, No, Yes, No, No, Yes, No, No, No, No, Yes,…
## $ MVR_PTS <dbl> 3, 0, 3, 0, 3, 0, 0, 10, 0, 1, 0, 0, 3, 3, 3, 0, 0, 0,…
## $ CAR_AGE <dbl> 18, 1, 10, 6, 17, 7, 1, 7, 1, 17, 11, 1, 9, 10, 5, 13,…
## $ URBANICITY <fct> Highly Urban/ Urban, Highly Urban/ Urban, Highly Urban…
## $ TARGET_AMT_log <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 7.98…
## $ AGE_log <dbl> 4.110874, 3.784190, 3.583519, 3.951244, 3.931826, 3.55…
## $ YOJ_log <dbl> 2.484907, 2.484907, 2.397895, 2.708050, 2.513008, 2.56…
## $ INCOME_log <dbl> 11.117658, 11.423548, 9.682841, 11.115233, 11.652574, …
## $ HOME_VAL_log <dbl> 0.00000, 12.45782, 11.72958, 12.63216, 12.40462, 0.000…
## $ TRAVTIME_log <dbl> 2.708050, 3.135494, 1.791759, 3.496508, 3.610918, 3.85…
## $ BLUEBOOK_log <dbl> 9.563178, 9.611864, 8.296796, 9.644782, 9.798183, 9.76…
## $ TIF_log <dbl> 2.4849066, 0.6931472, 1.6094379, 2.0794415, 0.6931472,…
## $ OLDCLAIM_log <dbl> 8.403352, 0.000000, 10.563362, 0.000000, 9.863603, 0.0…
## $ MVR_PTS_log <dbl> 1.3862944, 0.0000000, 1.3862944, 0.0000000, 1.3862944,…
## $ CAR_AGE_log <dbl> 2.9444390, 0.6931472, 2.3978953, 1.9459101, 2.8903718,…
## $ TARGET_AMT_sqrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 54.27707,…
## $ AGE_sqrt <dbl> 7.745967, 6.557439, 5.916080, 7.141428, 7.071068, 5.83…
## $ YOJ_sqrt <dbl> 3.316625, 3.316625, 3.162278, 3.741657, 3.367789, 3.46…
## $ INCOME_sqrt <dbl> 259.5169, 302.4054, 126.6452, 259.2024, 339.0959, 353.…
## $ HOME_VAL_sqrt <dbl> 0.0000, 507.2002, 352.4074, 553.3995, 493.8876, 0.0000…
## $ TRAVTIME_sqrt <dbl> 3.741657, 4.690416, 2.236068, 5.656854, 6.000000, 6.78…
## $ BLUEBOOK_sqrt <dbl> 119.28956, 122.22929, 63.32456, 124.25780, 134.16408, …
## $ TIF_sqrt <dbl> 3.316625, 1.000000, 2.000000, 2.645751, 1.000000, 1.00…
## $ OLDCLAIM_sqrt <dbl> 66.79072, 0.00000, 196.69774, 0.00000, 138.62539, 0.00…
## $ MVR_PTS_sqrt <dbl> 1.732051, 0.000000, 1.732051, 0.000000, 1.732051, 0.00…
## $ CAR_AGE_sqrt <dbl> 4.242641, 1.000000, 3.162278, 2.449490, 4.123106, 2.64…
## $ TARGET_AMT_cbrt <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 14.33544,…
## $ AGE_cbrt <dbl> 3.914868, 3.503398, 3.271066, 3.708430, 3.684031, 3.23…
## $ YOJ_cbrt <dbl> 2.223980, 2.223980, 2.154435, 2.410142, 2.246794, 2.28…
## $ INCOME_cbrt <dbl> 40.68588, 45.05327, 25.21888, 40.65300, 48.62747, 50.0…
## $ HOME_VAL_cbrt <dbl> 0.00000, 63.59939, 49.89190, 67.40506, 62.48159, 0.000…
## $ TRAVTIME_cbrt <dbl> 2.410142, 2.802039, 1.709976, 3.174802, 3.301927, 3.58…
## $ BLUEBOOK_cbrt <dbl> 24.23269, 24.62919, 15.88723, 24.90094, 26.20741, 25.9…
## $ TIF_cbrt <dbl> 2.223980, 1.000000, 1.587401, 1.912931, 1.000000, 1.00…
## $ OLDCLAIM_cbrt <dbl> 16.46180, 0.00000, 33.82202, 0.00000, 26.78522, 0.0000…
## $ MVR_PTS_cbrt <dbl> 1.442250, 0.000000, 1.442250, 0.000000, 1.442250, 0.00…
## $ CAR_AGE_cbrt <dbl> 2.620741, 1.000000, 2.154435, 1.817121, 2.571282, 1.91…
df_ins$HV_INC_RATIO <- df_ins$HOME_VAL / df_ins$INCOME
df_ins$TRT_MVR_PRODUCT <- df_ins$TRAVTIME * df_ins$MVR_PTS
df_ins$HV_INC_RATIO[is.nan(df_ins$HV_INC_RATIO)] <- 0
df_ins$HV_INC_RATIO[is.infinite(df_ins$HV_INC_RATIO)] <- 0
# Impute data by regression:
df_ins <- mice(df_ins, method = "norm.predict", m = 1, remove.collinear=FALSE)
##
## iter imp variable
## 1 1 INCOME_log* HOME_VAL_log CAR_AGE_log* INCOME_sqrt* HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 2 1 INCOME_log HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 3 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 4 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## 5 1 INCOME_log* HOME_VAL_log* CAR_AGE_log* INCOME_sqrt HOME_VAL_sqrt* CAR_AGE_sqrt* INCOME_cbrt* HOME_VAL_cbrt* CAR_AGE_cbrt*
## Warning: Number of logged events: 84
df_ins <- complete(df_ins)
# Confirm no NAs remain
colSums(is.na(df_ins))
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS
## 0 0 0 0 0
## YOJ INCOME PARENT1 HOME_VAL MSTATUS
## 0 0 0 0 0
## SEX EDUCATION JOB TRAVTIME CAR_USE
## 0 0 0 0 0
## BLUEBOOK TIF CAR_TYPE RED_CAR OLDCLAIM
## 0 0 0 0 0
## CLM_FREQ REVOKED MVR_PTS CAR_AGE URBANICITY
## 0 0 0 0 0
## TARGET_AMT_log AGE_log YOJ_log INCOME_log HOME_VAL_log
## 0 0 0 0 0
## TRAVTIME_log BLUEBOOK_log TIF_log OLDCLAIM_log MVR_PTS_log
## 0 0 0 0 0
## CAR_AGE_log TARGET_AMT_sqrt AGE_sqrt YOJ_sqrt INCOME_sqrt
## 0 0 0 0 0
## HOME_VAL_sqrt TRAVTIME_sqrt BLUEBOOK_sqrt TIF_sqrt OLDCLAIM_sqrt
## 0 0 0 0 0
## MVR_PTS_sqrt CAR_AGE_sqrt TARGET_AMT_cbrt AGE_cbrt YOJ_cbrt
## 0 0 0 0 0
## INCOME_cbrt HOME_VAL_cbrt TRAVTIME_cbrt BLUEBOOK_cbrt TIF_cbrt
## 0 0 0 0 0
## OLDCLAIM_cbrt MVR_PTS_cbrt CAR_AGE_cbrt HV_INC_RATIO TRT_MVR_PRODUCT
## 0 0 0 0 0
# Visualize correlation between variables
corrplot(cor(df_ins_imp %>% keep(is.numeric)), method="shade", shade.col=NA, tl.col="black", tl.srt=45)
# Reshape correlation results
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
# Closer look at correlations of variables
corr_results <- rcorr(as.matrix(df_ins_imp %>% keep(is.numeric)))
df_corr <- flattenCorrMatrix(corr_results$r, corr_results$P)
# Noteworthy positive correlations
df_corr %>% filter(cor > 0.4)
## row column cor p
## 1 INCOME HOME_VAL 0.5905965 0
## 2 INCOME BLUEBOOK 0.4347134 0
## 3 INCOME CAR_AGE 0.4259203 0
# Noteworthy negative correlations
df_corr %>% filter(cor < -0.4)
## [1] row column cor p
## <0 rows> (or 0-length row.names)
# Pair plot
pairs(df_ins_imp %>% keep(is.numeric), lower.panel = NULL, col = "steelblue")
model <- glm(TARGET_AMT ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + TRAVTIME + BLUEBOOK +
TIF + OLDCLAIM + MVR_PTS + CAR_AGE, data = df_ins, family = "quasipoisson")
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV 1.024923 4 1.003082
## AGE 1.094996 1 1.046421
## YOJ 1.182087 1 1.087238
## INCOME 1.821521 1 1.349637
## HOME_VAL 1.330627 1 1.153528
## TRAVTIME 1.004598 1 1.002296
## BLUEBOOK 1.229380 1 1.108774
## TIF 1.004250 1 1.002123
## OLDCLAIM 1.080919 1 1.039673
## MVR_PTS 1.097635 1 1.047681
## CAR_AGE 1.252010 1 1.118933
# 0-.25, .25-.75, .75-1
df_ins$CAR_AGE_fact <- cut(x = df_ins$CAR_AGE, breaks = c(-4, 3.5, 12, 28), labels = c("New", "Moderate", "Old"))
# -.5, .5-.9, .9+
df_ins$HOME_VAL_fact <- cut(x = df_ins$HOME_VAL, breaks = c(-86567, 160953, 314151, 885283), labels = c("No or Low", "Moderate", "High"))
# 0-.25, .25-.75, .75-1
df_ins$INCOME_fact <- cut(x = df_ins$INCOME, breaks = c(-31969, 27940, 85472, 367031), labels = c("Low", "Moderate", "High"))
# 0-.5, .50-.75, .75-1
df_ins$MVR_PTS_fact <- cut(x = df_ins$MVR_PTS, breaks = c(-1, 1, 3, 14), labels = c("Low", "Moderate", "High"))
# 0-.75, .75-1
df_ins$OLDCLAIM_fact <- cut(x = df_ins$OLDCLAIM, breaks = c(-1, 4636, 57038), labels = c("Low", "High"))
# 0-.25, .25-.75, .75-1
df_ins$TIF_fact <- cut(x = df_ins$TIF, breaks = c(-1, 1, 7, 26), labels = c("Low", "Moderate", "High"))
# 0-.25, .25-.75, .75-1
df_ins$TRAVTIME_fact <- cut(x = df_ins$TRAVTIME, breaks = c(4, 22, 44, 143), labels = c("Short", "Moderate", "Long"))
# 0-.25, .25-.75, .75-1
df_ins$YOJ_fact <- cut(x = df_ins$YOJ, breaks = c(-1, 9, 13, 24), labels = c("Low", "Moderate", "High"))
gls_model <- gls(TARGET_AMT ~ AGE_sqrt + BLUEBOOK_cbrt + CAR_AGE_fact +
+ INCOME_fact + OLDCLAIM_fact + MVR_PTS_cbrt, data = df_ins)
summary(gls_model)
## Generalized least squares fit by REML
## Model: TARGET_AMT ~ AGE_sqrt + BLUEBOOK_cbrt + CAR_AGE_fact + +INCOME_fact + OLDCLAIM_fact + MVR_PTS_cbrt
## Data: df_ins
## AIC BIC logLik
## 160922.4 160992.4 -80451.18
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 2101.5065 576.2795 3.646679 0.0003
## AGE_sqrt -216.0004 81.0069 -2.666446 0.0077
## BLUEBOOK_cbrt 26.6804 11.9392 2.234699 0.0255
## CAR_AGE_factModerate -266.3347 127.5685 -2.087778 0.0368
## CAR_AGE_factOld -575.1674 157.8888 -3.642865 0.0003
## INCOME_factModerate -34.2304 131.3937 -0.260518 0.7945
## INCOME_factHigh -501.4788 167.2547 -2.998294 0.0027
## OLDCLAIM_factHigh 534.4255 124.9052 4.278649 0.0000
## MVR_PTS_cbrt 634.1423 74.1482 8.552366 0.0000
##
## Correlation:
## (Intr) AGE_sq BLUEBO CAR_AGE_M CAR_AGE_O INCOME_M INCOME_H
## AGE_sqrt -0.862
## BLUEBOOK_cbrt -0.353 -0.105
## CAR_AGE_factModerate -0.096 -0.030 -0.018
## CAR_AGE_factOld 0.021 -0.106 -0.034 0.583
## INCOME_factModerate 0.055 -0.098 -0.212 -0.068 -0.114
## INCOME_factHigh 0.134 -0.086 -0.327 -0.124 -0.308 0.612
## OLDCLAIM_factHigh -0.035 0.004 0.005 0.017 0.005 -0.004 0.029
## MVR_PTS_cbrt -0.141 0.058 0.015 -0.017 -0.018 -0.005 0.014
## OLDCLA
## AGE_sqrt
## BLUEBOOK_cbrt
## CAR_AGE_factModerate
## CAR_AGE_factOld
## INCOME_factModerate
## INCOME_factHigh
## OLDCLAIM_factHigh
## MVR_PTS_cbrt -0.298
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -0.72746367 -0.35897036 -0.22728322 -0.04457738 22.59403804
##
## Residual standard error: 4653.781
## Degrees of freedom: 8161 total; 8152 residual
plot(gls_model)