library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(dplyr)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
insur_train <- read.csv("~/Desktop/insurance_training_data.csv", na.strings = c(""), stringsAsFactors = FALSE)
insur_eval <- read.csv("~/Desktop/insurance-evaluation-data.csv", na.strings = c(""), stringsAsFactors = FALSE)
insur_train_clean <- insur_train[,-1] #index using base r to remove column
insur_eval_clean <- insur_eval[, -1]
sapply(X=insur_train_clean,
FUN= class)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ
## "integer" "numeric" "integer" "integer" "integer" "integer"
## INCOME PARENT1 HOME_VAL MSTATUS SEX EDUCATION
## "character" "character" "character" "character" "character" "character"
## JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## "character" "integer" "character" "character" "integer" "character"
## RED_CAR OLDCLAIM CLM_FREQ REVOKED MVR_PTS CAR_AGE
## "character" "character" "integer" "character" "integer" "integer"
## URBANICITY
## "character"
insur_train_clean%>%
summarise(across(everything(),~ sum(is.na(.)))) %>%
glimpse()
## Rows: 1
## Columns: 25
## $ TARGET_FLAG <int> 0
## $ TARGET_AMT <int> 0
## $ KIDSDRIV <int> 0
## $ AGE <int> 6
## $ HOMEKIDS <int> 0
## $ YOJ <int> 454
## $ INCOME <int> 445
## $ PARENT1 <int> 0
## $ HOME_VAL <int> 464
## $ MSTATUS <int> 0
## $ SEX <int> 0
## $ EDUCATION <int> 0
## $ JOB <int> 526
## $ TRAVTIME <int> 0
## $ CAR_USE <int> 0
## $ BLUEBOOK <int> 0
## $ TIF <int> 0
## $ CAR_TYPE <int> 0
## $ RED_CAR <int> 0
## $ OLDCLAIM <int> 0
## $ CLM_FREQ <int> 0
## $ REVOKED <int> 0
## $ MVR_PTS <int> 0
## $ CAR_AGE <int> 510
## $ URBANICITY <int> 0
insur_train_clean <- na.omit(insur_train_clean)
N/A variables tend to be in the same rows. So I am omitting all of those rows with N/As.
Correlation of some variables I thought might have been highly correlated.
cor(insur_train_clean$TARGET_FLAG, insur_train_clean$AGE)
## [1] -0.1152745
cor(insur_train_clean$TARGET_FLAG, insur_train_clean$CAR_AGE)
## [1] -0.1102527
cor(insur_train_clean$TARGET_AMT, insur_train_clean$CAR_AGE)
## [1] -0.06961347
cor(insur_train_clean$TARGET_AMT, insur_train_clean$AGE)
## [1] -0.05654628
table(insur_train_clean$CAR_USE, insur_train_clean$TARGET_FLAG)
##
## 0 1
## Commercial 1297 743
## Private 3146 859
ggplot(data=insur_train_clean, aes(x=CAR_USE, y= TARGET_FLAG)) +
geom_bar(stat= "identity", width = 0.5, fill= "steelblue") +
geom_text(aes(label=TARGET_FLAG)) +
labs( x = "Car Use",
y = "Number of Crashes",
title = "Number of Crashes based on Type of Car Use")
There is not much of a difference in the amount of crashes commercial vehicles get in to versus the amount private cars get in to. However, it does make sense that commercial cars tend to get into less crashes. This is because they are probably getting paid to drive as part of their job, the cars are probably not theirs, and they might be driving long distances as part of their job. So these people may tend to be extra careful when driving.
table(insur_train_clean$EDUCATION, insur_train_clean$TARGET_FLAG)
##
## 0 1
## <High School 642 313
## Bachelors 1338 402
## Masters 881 180
## PhD 345 72
## z_High School 1237 635
ggplot(data=insur_train_clean, aes(x=EDUCATION, y= TARGET_FLAG)) +
geom_bar(stat= "identity", width = 0.5, fill= "steelblue") +
geom_text(aes(label=TARGET_FLAG)) +
labs(x= "Level of Education",
y= "Number of Crashes",
title= "Number of Crashes Based Level of Education")
Those who only have high school education probably do not have the typical 9-5 corporate job. They may be out on the roads more as part of their job. Or, they may just not have a job and have more time on their hands to drive around. Some of these people could also be right out of high school, when teenagers tend to be reckless. It makes sense that people with a PhD get in the least amount of crashes because they are probably working or studying most of their days.
table(insur_train_clean$JOB, insur_train_clean$TARGET_FLAG)
##
## 0 1
## Clerical 719 312
## Doctor 173 27
## Home Maker 351 133
## Lawyer 543 127
## Manager 680 99
## Professional 678 190
## Student 337 200
## z_Blue Collar 962 514
ggplot(data=insur_train_clean, aes(x=JOB, y= TARGET_FLAG)) +
geom_bar(stat= "identity", width = 0.5, fill= "steelblue") +
geom_text(aes(label=TARGET_FLAG)) +
labs(x= "Occupation",
y= "Number of Crashes",
title= "Number of Crashes Based on Occupation")
Someone who is a blue collar worker may have to drive as part of their job, as where a doctor or manager does not. Blue collar workers might be driving longer distances and their job hours may be shorter. Whereas a doctor is probably in the hospital for many hours a day and does not drive that often.
library(stringr)
insur_train_clean <- insur_train_clean %>%
mutate(INCOME=as.numeric(str_replace_all(INCOME,'\\$|,',''))) %>%
mutate(HOME_VAL=as.numeric(str_replace_all(HOME_VAL,'\\$|,',''))) %>%
mutate(BLUEBOOK=as.numeric(str_replace_all(BLUEBOOK,'\\$|,',''))) %>%
mutate(OLDCLAIM=as.numeric(str_replace_all(OLDCLAIM,'\\$|,','')))
insur_eval_clean <- insur_eval_clean %>%
mutate(INCOME=as.numeric(str_replace_all(INCOME,'\\$|,',''))) %>%
mutate(HOME_VAL=as.numeric(str_replace_all(HOME_VAL,'\\$|,',''))) %>%
mutate(BLUEBOOK=as.numeric(str_replace_all(BLUEBOOK,'\\$|,',''))) %>%
mutate(OLDCLAIM=as.numeric(str_replace_all(OLDCLAIM,'\\$|,','')))
Dummy variables are either 0 or 1. You use dummy variables when the answer is yes/no. The five variables below, that I made dummy variables, are those that had a yes/no answer. The benefit of using dummy variables is that it makes the variable numeric. These variables can now be shown in my summary stats.
library(fastDummies)
insur_train_clean.d <- dummy_cols(insur_train_clean,
select_columns =
c('PARENT1', 'MSTATUS',
'RED_CAR', 'REVOKED', 'URBANICITY'),
remove_selected_columns = TRUE)
insur_eval_clean.d <- dummy_cols(insur_train_clean,
select_columns =
c('PARENT1', 'MSTATUS',
'RED_CAR', 'REVOKED', 'URBANICITY'),
remove_selected_columns = TRUE)
labels <- c(
'Crash',
'Cost (if crash occured)',
'Number of Kids Driving',
'Age',
'Number of Kids at Home',
'Years on Job',
'Income',
'Home Value',
'Distance to Work',
'Value of Vehicle',
'Time in Force',
'Total Old Claims',
'Claims Filed',
'Motor Vehicle Record Points',
'Vehicle Age',
'Not a Single Parent',
'Single Parent',
'Married',
'Not Married',
'Not a Red Car',
'Red Car',
'License not Revoked',
'License Revoked',
'Urban Area',
'Rural Area')
stargazer(insur_train_clean.d,
type = "text", # html, latex
# out =
# summary.stat =
covariate.labels = labels,
digits = 2)
##
## =======================================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------------------
## Crash 6,045 0.27 0.44 0 1
## Cost (if crash occured) 6,045 1,479.66 4,553.17 0.00 85,523.65
## Number of Kids Driving 6,045 0.17 0.52 0 4
## Age 6,045 44.63 8.71 16 81
## Number of Kids at Home 6,045 0.74 1.13 0 5
## Years on Job 6,045 10.49 4.14 0 23
## Income 6,045 58,177.01 43,826.98 0 367,030
## Home Value 6,045 150,102.10 123,728.70 0 885,282
## Distance to Work 6,045 33.69 15.89 5 142
## Value of Vehicle 6,045 15,235.61 8,040.96 1,500 65,970
## Time in Force 6,045 5.36 4.14 1 25
## Total Old Claims 6,045 4,004.88 8,822.51 0 57,037
## Claims Filed 6,045 0.78 1.15 0 5
## Motor Vehicle Record Points 6,045 1.70 2.16 0 13
## Vehicle Age 6,045 7.92 5.58 -3 28
## Not a Single Parent 6,045 0.86 0.34 0 1
## Single Parent 6,045 0.14 0.34 0 1
## Married 6,045 0.60 0.49 0 1
## Not Married 6,045 0.40 0.49 0 1
## Not a Red Car 6,045 0.72 0.45 0 1
## Red Car 6,045 0.28 0.45 0 1
## License not Revoked 6,045 0.88 0.33 0 1
## License Revoked 6,045 0.12 0.33 0 1
## Urban Area 6,045 0.78 0.41 0 1
## Rural Area 6,045 0.22 0.41 0 1
## -----------------------------------------------------------------------
I was interested to see that the mean age was around 45 years old. I wonder how the data would look if the mean age was older or younger. There would probably be more crashes if the mean age was younger (or even significantly older for that matter). The average amount of claims filed in the past five years for this population is .78, and the average motor vehicle record points is 1.70. This leads me to believe that this groups may be relatively safe drivers.
c <- cor(insur_train_clean.d [, c(2,25:30)])
library(corrplot)
## corrplot 0.92 loaded
corrplot(c, type = "upper")
model1 <- lm(data = insur_train_clean.d,
TARGET_AMT ~ . -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -TARGET_FLAG)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(model1)
## GVIF Df GVIF^(1/(2*Df))
## KIDSDRIV 1.313594 1 1.146121
## AGE 1.492547 1 1.221699
## HOMEKIDS 2.112021 1 1.453279
## YOJ 1.467479 1 1.211395
## INCOME 3.150506 1 1.774966
## HOME_VAL 2.454136 1 1.566568
## SEX 3.221519 1 1.794859
## EDUCATION 11.263320 4 1.353500
## JOB 22.073019 7 1.247355
## TRAVTIME 1.036178 1 1.017928
## CAR_USE 2.353409 1 1.534082
## BLUEBOOK 1.872914 1 1.368544
## TIF 1.008654 1 1.004318
## CAR_TYPE 4.501908 5 1.162357
## OLDCLAIM 1.704900 1 1.305718
## CLM_FREQ 1.607648 1 1.267931
## MVR_PTS 1.236336 1 1.111907
## CAR_AGE 2.046281 1 1.430483
## PARENT1_No 1.856170 1 1.362413
## MSTATUS_Yes 2.109977 1 1.452576
## RED_CAR_yes 1.850907 1 1.360480
## REVOKED_No 1.295934 1 1.138391
## `URBANICITY_Highly Urban/ Urban` 1.251364 1 1.118644
I eliminated those specific variables because when I would do summary stats, they would come up as n.a. So I ran a correlation of my data and saw that these variables were coming up as n.a because they were highly co-linear to its counterpart.
model2 <- lm(data = insur_train_clean.d,
TARGET_AMT ~ . -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -EDUCATION -JOB -TARGET_FLAG)
Eliminated education and job because of multi-colinearity
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model3 <- stepAIC(object = lm(data = insur_train_clean.d,
TARGET_AMT ~ . -TARGET_FLAG),
direction = c("backward")
) #used backward selection
## Start: AIC=101432.9
## TARGET_AMT ~ (TARGET_FLAG + KIDSDRIV + AGE + HOMEKIDS + YOJ +
## INCOME + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME + CAR_USE +
## BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS +
## CAR_AGE + PARENT1_No + PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No +
## RED_CAR_no + RED_CAR_yes + REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban` +
## `URBANICITY_z_Highly Rural/ Rural`) - TARGET_FLAG
##
##
## Step: AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes +
## REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + REVOKED_No +
## `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=101432.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - YOJ 1 351556 1.1571e+11 101431
## - HOMEKIDS 1 896426 1.1571e+11 101431
## - AGE 1 5868923 1.1572e+11 101431
## - OLDCLAIM 1 6544464 1.1572e+11 101431
## - RED_CAR_no 1 18824464 1.1573e+11 101432
## - INCOME 1 31562132 1.1575e+11 101433
## - CLM_FREQ 1 31640073 1.1575e+11 101433
## - BLUEBOOK 1 33101062 1.1575e+11 101433
## <none> 1.1571e+11 101433
## - HOME_VAL 1 41502176 1.1576e+11 101433
## - EDUCATION 4 161597962 1.1588e+11 101433
## - KIDSDRIV 1 52170833 1.1577e+11 101434
## - CAR_AGE 1 59033415 1.1577e+11 101434
## - PARENT1_No 1 84127414 1.1580e+11 101435
## - SEX 1 102980401 1.1582e+11 101436
## - REVOKED_No 1 124659787 1.1584e+11 101437
## - TRAVTIME 1 173886142 1.1589e+11 101440
## - MSTATUS_Yes 1 202041858 1.1592e+11 101441
## - TIF 1 204543749 1.1592e+11 101442
## - CAR_USE 1 328063504 1.1604e+11 101448
## - JOB 7 677813229 1.1639e+11 101454
## - CAR_TYPE 5 701858743 1.1642e+11 101459
## - MVR_PTS 1 679930613 1.1639e+11 101466
## - `URBANICITY_Highly Urban/ Urban` 1 2195481766 1.1791e+11 101545
##
## Step: AIC=101430.9
## TARGET_AMT ~ KIDSDRIV + AGE + HOMEKIDS + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - HOMEKIDS 1 709162 1.1571e+11 101429
## - AGE 1 6626898 1.1572e+11 101429
## - OLDCLAIM 1 6628961 1.1572e+11 101429
## - RED_CAR_no 1 18852982 1.1573e+11 101430
## - CLM_FREQ 1 31758506 1.1575e+11 101431
## - INCOME 1 32314466 1.1575e+11 101431
## - BLUEBOOK 1 33080536 1.1575e+11 101431
## <none> 1.1571e+11 101431
## - HOME_VAL 1 41487801 1.1576e+11 101431
## - EDUCATION 4 161967684 1.1588e+11 101431
## - KIDSDRIV 1 52695492 1.1577e+11 101432
## - CAR_AGE 1 59011572 1.1577e+11 101432
## - PARENT1_No 1 84333238 1.1580e+11 101433
## - SEX 1 103113408 1.1582e+11 101434
## - REVOKED_No 1 124836799 1.1584e+11 101435
## - TRAVTIME 1 173726188 1.1589e+11 101438
## - TIF 1 204732923 1.1592e+11 101440
## - MSTATUS_Yes 1 205027759 1.1592e+11 101440
## - CAR_USE 1 328574414 1.1604e+11 101446
## - JOB 7 679641790 1.1639e+11 101452
## - CAR_TYPE 5 703295192 1.1642e+11 101458
## - MVR_PTS 1 680841708 1.1639e+11 101464
## - `URBANICITY_Highly Urban/ Urban` 1 2195131212 1.1791e+11 101543
##
## Step: AIC=101429
## TARGET_AMT ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM +
## CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes +
## RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - OLDCLAIM 1 6593452 1.1572e+11 101427
## - AGE 1 9376881 1.1572e+11 101427
## - RED_CAR_no 1 18825907 1.1573e+11 101428
## - CLM_FREQ 1 31754169 1.1575e+11 101429
## - INCOME 1 31998738 1.1575e+11 101429
## - BLUEBOOK 1 33220175 1.1575e+11 101429
## <none> 1.1571e+11 101429
## - HOME_VAL 1 41847940 1.1576e+11 101429
## - EDUCATION 4 162853225 1.1588e+11 101429
## - CAR_AGE 1 59169562 1.1577e+11 101430
## - KIDSDRIV 1 70513453 1.1579e+11 101431
## - SEX 1 102991203 1.1582e+11 101432
## - PARENT1_No 1 109747410 1.1582e+11 101433
## - REVOKED_No 1 125018867 1.1584e+11 101433
## - TRAVTIME 1 173346976 1.1589e+11 101436
## - TIF 1 204467384 1.1592e+11 101438
## - MSTATUS_Yes 1 213380590 1.1593e+11 101438
## - CAR_USE 1 328956111 1.1604e+11 101444
## - JOB 7 679673483 1.1639e+11 101450
## - CAR_TYPE 5 706136197 1.1642e+11 101456
## - MVR_PTS 1 681042854 1.1640e+11 101462
## - `URBANICITY_Highly Urban/ Urban` 1 2194627368 1.1791e+11 101541
##
## Step: AIC=101427.3
## TARGET_AMT ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ +
## MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + RED_CAR_no +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - AGE 1 9496700 1.1573e+11 101426
## - RED_CAR_no 1 18956277 1.1574e+11 101426
## - CLM_FREQ 1 25160725 1.1575e+11 101427
## - INCOME 1 32104527 1.1575e+11 101427
## - BLUEBOOK 1 33214709 1.1575e+11 101427
## <none> 1.1572e+11 101427
## - HOME_VAL 1 42208097 1.1576e+11 101428
## - EDUCATION 4 163012416 1.1588e+11 101428
## - CAR_AGE 1 58974407 1.1578e+11 101428
## - KIDSDRIV 1 71368155 1.1579e+11 101429
## - SEX 1 103856545 1.1583e+11 101431
## - PARENT1_No 1 110362036 1.1583e+11 101431
## - REVOKED_No 1 126806659 1.1585e+11 101432
## - TRAVTIME 1 175530579 1.1590e+11 101434
## - TIF 1 205216640 1.1593e+11 101436
## - MSTATUS_Yes 1 213272987 1.1593e+11 101436
## - CAR_USE 1 329642166 1.1605e+11 101443
## - JOB 7 680837686 1.1640e+11 101449
## - CAR_TYPE 5 707135458 1.1643e+11 101454
## - MVR_PTS 1 674459473 1.1640e+11 101460
## - `URBANICITY_Highly Urban/ Urban` 1 2196176268 1.1792e+11 101539
##
## Step: AIC=101425.8
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ +
## MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + RED_CAR_no +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - RED_CAR_no 1 18272014 1.1575e+11 101425
## - CLM_FREQ 1 24664667 1.1576e+11 101425
## - BLUEBOOK 1 28813026 1.1576e+11 101425
## - INCOME 1 30025417 1.1576e+11 101425
## <none> 1.1573e+11 101426
## - EDUCATION 4 161207343 1.1589e+11 101426
## - HOME_VAL 1 46304526 1.1578e+11 101426
## - CAR_AGE 1 60004338 1.1579e+11 101427
## - KIDSDRIV 1 70286166 1.1580e+11 101427
## - SEX 1 97412578 1.1583e+11 101429
## - REVOKED_No 1 127894375 1.1586e+11 101430
## - PARENT1_No 1 140005485 1.1587e+11 101431
## - TRAVTIME 1 174217504 1.1590e+11 101433
## - TIF 1 205093505 1.1594e+11 101435
## - MSTATUS_Yes 1 207097688 1.1594e+11 101435
## - CAR_USE 1 330695298 1.1606e+11 101441
## - JOB 7 685431673 1.1642e+11 101448
## - CAR_TYPE 5 697976097 1.1643e+11 101452
## - MVR_PTS 1 681418683 1.1641e+11 101459
## - `URBANICITY_Highly Urban/ Urban` 1 2203840642 1.1793e+11 101538
##
## Step: AIC=101424.8
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + CLM_FREQ +
## MVR_PTS + CAR_AGE + PARENT1_No + MSTATUS_Yes + REVOKED_No +
## `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - CLM_FREQ 1 23956329 1.1577e+11 101424
## - INCOME 1 30254178 1.1578e+11 101424
## - BLUEBOOK 1 30521205 1.1578e+11 101424
## <none> 1.1575e+11 101425
## - HOME_VAL 1 44880251 1.1579e+11 101425
## - EDUCATION 4 163322639 1.1591e+11 101425
## - CAR_AGE 1 60520141 1.1581e+11 101426
## - KIDSDRIV 1 70732988 1.1582e+11 101426
## - SEX 1 79511727 1.1583e+11 101427
## - REVOKED_No 1 126885211 1.1588e+11 101429
## - PARENT1_No 1 140730652 1.1589e+11 101430
## - TRAVTIME 1 173608071 1.1592e+11 101432
## - TIF 1 205474008 1.1595e+11 101433
## - MSTATUS_Yes 1 207122826 1.1596e+11 101434
## - CAR_USE 1 327908821 1.1608e+11 101440
## - JOB 7 689640136 1.1644e+11 101447
## - CAR_TYPE 5 706525174 1.1646e+11 101452
## - MVR_PTS 1 680422210 1.1643e+11 101458
## - `URBANICITY_Highly Urban/ Urban` 1 2199102531 1.1795e+11 101537
##
## Step: AIC=101424
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + MVR_PTS +
## CAR_AGE + PARENT1_No + MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - BLUEBOOK 1 29596008 1.1580e+11 101424
## - INCOME 1 31216401 1.1580e+11 101424
## <none> 1.1577e+11 101424
## - HOME_VAL 1 46864840 1.1582e+11 101424
## - EDUCATION 4 163446543 1.1594e+11 101425
## - CAR_AGE 1 60527935 1.1583e+11 101425
## - KIDSDRIV 1 73065439 1.1585e+11 101426
## - SEX 1 82269674 1.1586e+11 101426
## - REVOKED_No 1 129665521 1.1590e+11 101429
## - PARENT1_No 1 139933748 1.1591e+11 101429
## - TRAVTIME 1 179501344 1.1595e+11 101431
## - TIF 1 207364442 1.1598e+11 101433
## - MSTATUS_Yes 1 211992738 1.1598e+11 101433
## - CAR_USE 1 336641614 1.1611e+11 101440
## - JOB 7 692555117 1.1647e+11 101446
## - CAR_TYPE 5 726874745 1.1650e+11 101452
## - MVR_PTS 1 882053637 1.1665e+11 101468
## - `URBANICITY_Highly Urban/ Urban` 1 2419672505 1.1819e+11 101547
##
## Step: AIC=101423.6
## TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + TIF + CAR_TYPE + MVR_PTS + CAR_AGE +
## PARENT1_No + MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## - INCOME 1 22919033 1.1583e+11 101423
## <none> 1.1580e+11 101424
## - EDUCATION 4 161017781 1.1596e+11 101424
## - HOME_VAL 1 46298975 1.1585e+11 101424
## - SEX 1 57222491 1.1586e+11 101425
## - CAR_AGE 1 60948228 1.1586e+11 101425
## - KIDSDRIV 1 73724835 1.1588e+11 101425
## - REVOKED_No 1 129663744 1.1593e+11 101428
## - PARENT1_No 1 137304720 1.1594e+11 101429
## - TRAVTIME 1 181307621 1.1598e+11 101431
## - TIF 1 206084010 1.1601e+11 101432
## - MSTATUS_Yes 1 210296361 1.1601e+11 101433
## - CAR_USE 1 338968629 1.1614e+11 101439
## - JOB 7 690165033 1.1649e+11 101445
## - CAR_TYPE 5 697296271 1.1650e+11 101450
## - MVR_PTS 1 872200399 1.1667e+11 101467
## - `URBANICITY_Highly Urban/ Urban` 1 2419270022 1.1822e+11 101547
##
## Step: AIC=101422.8
## TARGET_AMT ~ KIDSDRIV + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME +
## CAR_USE + TIF + CAR_TYPE + MVR_PTS + CAR_AGE + PARENT1_No +
## MSTATUS_Yes + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Sum of Sq RSS AIC
## <none> 1.1583e+11 101423
## - EDUCATION 4 153670907 1.1598e+11 101423
## - CAR_AGE 1 62093976 1.1589e+11 101424
## - SEX 1 66340374 1.1589e+11 101424
## - KIDSDRIV 1 71139964 1.1590e+11 101424
## - HOME_VAL 1 107710327 1.1593e+11 101426
## - REVOKED_No 1 129311580 1.1595e+11 101428
## - PARENT1_No 1 139715547 1.1597e+11 101428
## - TRAVTIME 1 180088157 1.1601e+11 101430
## - MSTATUS_Yes 1 188276349 1.1601e+11 101431
## - TIF 1 206076086 1.1603e+11 101432
## - CAR_USE 1 335947722 1.1616e+11 101438
## - JOB 7 711659101 1.1654e+11 101446
## - CAR_TYPE 5 711291671 1.1654e+11 101450
## - MVR_PTS 1 884896051 1.1671e+11 101467
## - `URBANICITY_Highly Urban/ Urban` 1 2412424760 1.1824e+11 101545
stargazer(model1, model2, model3,
type = "text"
) #summary stats of each of the linear models
##
## ==============================================================================================================
## Dependent variable:
## -----------------------------------------------------------------------------
## TARGET_AMT
## (1) (2) (3)
## --------------------------------------------------------------------------------------------------------------
## KIDSDRIV 206.594* 212.654* 218.208*
## (125.525) (125.562) (113.517)
##
## AGE -4.372 -6.301
## (7.920) (7.811)
##
## HOMEKIDS 15.625 21.378
## (72.424) (71.762)
##
## YOJ -2.232 5.743
## (16.524) (14.944)
##
## INCOME -0.003 -0.004**
## (0.002) (0.002)
##
## HOME_VAL -0.001 -0.001 -0.001**
## (0.001) (0.001) (0.001)
##
## SEXz_F -471.463** -451.198** -300.608*
## (203.891) (202.442) (161.942)
##
## EDUCATIONBachelors -362.997 -403.359*
## (226.602) (223.245)
##
## EDUCATIONMasters -303.552 -369.819
## (339.569) (334.665)
##
## EDUCATIONPhD 437.621 310.084
## (426.167) (410.704)
##
## EDUCATIONz_High School -221.312 -238.057
## (186.963) (186.204)
##
## JOBDoctor -1,191.010** -1,265.235***
## (495.837) (489.641)
##
## JOBHome Maker -96.718 -30.327
## (279.862) (257.757)
##
## JOBLawyer -68.964 -127.456
## (335.856) (331.297)
##
## JOBManager -958.360*** -1,028.263***
## (260.469) (253.514)
##
## JOBProfessional 128.638 75.220
## (237.174) (231.962)
##
## JOBStudent -276.939 -245.726
## (264.625) (248.793)
##
## JOBz_Blue Collar 156.855 118.662
## (209.903) (206.701)
##
## TRAVTIME 10.864*** 11.628*** 11.035***
## (3.616) (3.619) (3.608)
##
## CAR_USEPrivate -755.804*** -867.408*** -763.519***
## (183.129) (139.336) (182.781)
##
## BLUEBOOK 0.013 0.012
## (0.010) (0.010)
##
## TIF -44.577*** -43.756*** -44.730***
## (13.679) (13.693) (13.672)
##
## CAR_TYPEPanel Truck 475.550 386.274 639.777**
## (328.741) (315.181) (299.782)
##
## CAR_TYPEPickup 405.725** 363.335** 388.787**
## (187.946) (184.355) (186.416)
##
## CAR_TYPESports Car 1,264.272*** 1,225.516*** 1,194.577***
## (236.653) (237.135) (221.692)
##
## CAR_TYPEVan 489.190** 495.477** 543.162**
## (241.837) (239.452) (235.256)
##
## CAR_TYPEz_SUV 910.320*** 885.654*** 850.619***
## (194.784) (195.049) (179.335)
##
## OLDCLAIM -0.005 -0.005
## (0.008) (0.008)
##
## CLM_FREQ 79.504 85.876
## (62.029) (62.150)
##
## MVR_PTS 172.825*** 181.544*** 183.338***
## (29.087) (29.070) (27.043)
##
## CAR_AGE -25.318* -37.451*** -25.954*
## (14.461) (11.360) (14.452)
##
## PARENT1_No -468.900** -452.925** -527.718***
## (224.356) (224.678) (195.897)
##
## MSTATUS_Yes -540.981*** -572.276*** -481.786***
## (167.028) (165.943) (154.065)
##
## RED_CAR_yes -169.136 -193.872
## (171.081) (171.367)
##
## REVOKED_No -497.034** -516.310*** -449.324***
## (195.367) (195.498) (173.376)
##
## `URBANICITY_Highly Urban/ Urban` 1,640.836*** 1,505.856*** 1,676.540***
## (153.684) (150.295) (149.774)
##
## Constant 1,998.163*** 1,988.602*** 1,787.354***
## (568.905) (505.544) (411.610)
##
## --------------------------------------------------------------------------------------------------------------
## Observations 6,045 6,045 6,045
## R2 0.077 0.070 0.076
## Adjusted R2 0.071 0.066 0.071
## Residual Std. Error 4,388.612 (df = 6008) 4,399.793 (df = 6019) 4,387.813 (df = 6016)
## F Statistic 13.827*** (df = 36; 6008) 18.150*** (df = 25; 6019) 17.576*** (df = 28; 6016)
## ==============================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Age- If there is a one year increase in age, then there is a -4.372 dollar decrease in the amount spent if there was a crash all else being equal. This may make sense because as someone gets older, they may be less reckless when driving.
Job type- Having your job be a doctor is negatively correlated (-1,265.235) with the amount of money you would have to pay if you were in a crash. This makes sense because doctors tend to work a lot and be smart people, so they will be less likely to get in an accident and therefore have to pay money because of it. On the other hand blue collar workes
Motor vehicle points- If there is a one point increase in motor vehical points, then there is a 183.338 dollar increase in the amount spent if there was a crash, all else being equal. This makes sense because you gain motor vehicle points by getting tickets for speeding or reckless driving. So if someone has more points, that probably means they are not as safe of a driver as someone who does not have any points. And they will be more likely to get in a crash.
Travel time- If there is a one (minute or hour?) increase in travel time to work, then there is a 11.035 dollar increase in the amount spent if there was a crash, all else being equal. This makes sense because if someone drives more to work, they are spending more time on the road, so they may be more likely to get in a crash.
R2 - Usually, the larger the R2, the better the regression model fits your observations. Between the three models, there is not one that is significantly better than the others. However, model 1 explains 7.7% of the observations and model 3 explains 7.6% of the observations.
F stat - This is the ratio of two variances. The F stat for each of the models is highly significant.
Residual standard error- The smaller the residual standard error, the better the regression model fits the data set.The residual standard error is a bit lower for model 3 (4,387) than it is for model 2 and model 1.
Model 1 contains jobs and education which were highly multicollinear, so I will not be going with that one. Because of the adjusted r2 and the stepAIC function, I am going to go with model 3.
plot(model3)
model_logit <- glm(formula = TARGET_FLAG ~ . ,
data = insur_train_clean.d,
family = binomial (link = "logit"))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"),
## data = insur_train_clean.d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.417e-03 -2.000e-08 -2.000e-08 2.000e-08 3.016e-03
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.360e+01 4.380e+03 -0.008 0.994
## TARGET_AMT 2.650e-01 3.433e+00 0.077 0.938
## KIDSDRIV 5.912e+00 1.019e+03 0.006 0.995
## AGE -3.261e-01 3.630e+01 -0.009 0.993
## HOMEKIDS -1.506e+00 8.913e+02 -0.002 0.999
## YOJ -9.020e-01 1.210e+02 -0.007 0.994
## INCOME 1.876e-05 2.329e-02 0.001 0.999
## HOME_VAL 4.109e-05 5.654e-03 0.007 0.994
## SEXz_F -3.347e+00 2.720e+03 -0.001 0.999
## EDUCATIONBachelors -3.468e+00 2.564e+03 -0.001 0.999
## EDUCATIONMasters 1.441e+01 1.194e+04 0.001 0.999
## EDUCATIONPhD 4.679e+00 8.190e+03 0.001 1.000
## EDUCATIONz_High School 1.434e+01 1.665e+03 0.009 0.993
## JOBDoctor -2.039e+02 2.976e+05 -0.001 0.999
## JOBHome Maker -1.215e+01 4.126e+03 -0.003 0.998
## JOBLawyer -5.558e+00 1.212e+04 0.000 1.000
## JOBManager -2.807e-01 5.761e+03 0.000 1.000
## JOBProfessional -5.576e+00 3.433e+04 0.000 1.000
## JOBStudent 1.136e+01 2.619e+03 0.004 0.997
## JOBz_Blue Collar 1.617e+01 2.654e+03 0.006 0.995
## TRAVTIME -2.008e-01 2.951e+01 -0.007 0.995
## CAR_USEPrivate -3.083e+00 1.314e+03 -0.002 0.998
## BLUEBOOK -1.111e-03 1.378e-01 -0.008 0.994
## TIF -1.120e+00 1.261e+02 -0.009 0.993
## CAR_TYPEPanel Truck 1.219e+01 3.499e+04 0.000 1.000
## CAR_TYPEPickup 1.359e+01 1.038e+03 0.013 0.990
## CAR_TYPESports Car 5.215e+00 2.975e+03 0.002 0.999
## CAR_TYPEVan -3.290e+00 4.275e+03 -0.001 0.999
## CAR_TYPEz_SUV -1.400e+01 5.675e+03 -0.002 0.998
## OLDCLAIM -7.916e-04 6.893e-02 -0.011 0.991
## CLM_FREQ 7.199e+00 4.179e+02 0.017 0.986
## MVR_PTS -2.031e-01 1.682e+02 -0.001 0.999
## CAR_AGE 1.834e-01 1.529e+02 0.001 0.999
## PARENT1_No 9.552e+00 2.272e+03 0.004 0.997
## PARENT1_Yes NA NA NA NA
## MSTATUS_Yes -9.312e+00 8.186e+02 -0.011 0.991
## MSTATUS_z_No NA NA NA NA
## RED_CAR_no 4.338e+00 9.682e+02 0.004 0.996
## RED_CAR_yes NA NA NA NA
## REVOKED_No -1.933e+00 1.534e+03 -0.001 0.999
## REVOKED_Yes NA NA NA NA
## `URBANICITY_Highly Urban/ Urban` 2.318e+00 8.182e+02 0.003 0.998
## `URBANICITY_z_Highly Rural/ Rural` NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9909e+03 on 6044 degrees of freedom
## Residual deviance: 4.5966e-05 on 6007 degrees of freedom
## AIC: 76
##
## Number of Fisher Scoring iterations: 25
model_logit2 <- stepAIC(object = glm(formula = TARGET_FLAG ~ . -TARGET_AMT,
data = insur_train_clean.d,
family = binomial (link = "logit")),
direction = c("backward"))
## Start: AIC=5436.27
## TARGET_FLAG ~ (TARGET_AMT + KIDSDRIV + AGE + HOMEKIDS + YOJ +
## INCOME + HOME_VAL + SEX + EDUCATION + JOB + TRAVTIME + CAR_USE +
## BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS +
## CAR_AGE + PARENT1_No + PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No +
## RED_CAR_no + RED_CAR_yes + REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban` +
## `URBANICITY_z_Highly Rural/ Rural`) - TARGET_AMT
##
##
## Step: AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes +
## REVOKED_No + REVOKED_Yes + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + RED_CAR_yes +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + MSTATUS_z_No + RED_CAR_no + REVOKED_No +
## `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## PARENT1_Yes + MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
##
## Step: AIC=5436.27
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + CAR_AGE + PARENT1_No +
## MSTATUS_Yes + RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Deviance AIC
## - CAR_AGE 1 5362.5 5434.5
## - HOMEKIDS 1 5362.6 5434.6
## - AGE 1 5363.0 5435.0
## - YOJ 1 5363.2 5435.2
## <none> 5362.3 5436.3
## - SEX 1 5364.9 5436.9
## - RED_CAR_no 1 5367.1 5439.1
## - INCOME 1 5368.1 5440.1
## - OLDCLAIM 1 5370.6 5442.6
## - PARENT1_No 1 5372.8 5444.8
## - HOME_VAL 1 5373.3 5445.3
## - EDUCATION 4 5382.1 5448.1
## - BLUEBOOK 1 5376.2 5448.2
## - MSTATUS_Yes 1 5380.1 5452.1
## - KIDSDRIV 1 5382.6 5454.6
## - CLM_FREQ 1 5398.3 5470.3
## - TIF 1 5401.0 5473.0
## - JOB 7 5425.0 5485.0
## - TRAVTIME 1 5413.7 5485.7
## - MVR_PTS 1 5416.3 5488.3
## - REVOKED_No 1 5424.1 5496.1
## - CAR_USE 1 5424.5 5496.5
## - CAR_TYPE 5 5444.9 5508.9
## - `URBANICITY_Highly Urban/ Urban` 1 5842.8 5914.8
##
## Step: AIC=5434.47
## TARGET_FLAG ~ KIDSDRIV + AGE + HOMEKIDS + YOJ + INCOME + HOME_VAL +
## SEX + EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF +
## CAR_TYPE + OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes +
## RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Deviance AIC
## - HOMEKIDS 1 5362.8 5432.8
## - AGE 1 5363.2 5433.2
## - YOJ 1 5363.4 5433.4
## <none> 5362.5 5434.5
## - SEX 1 5365.1 5435.1
## - RED_CAR_no 1 5367.3 5437.3
## - INCOME 1 5368.4 5438.4
## - OLDCLAIM 1 5370.8 5440.8
## - PARENT1_No 1 5373.0 5443.0
## - HOME_VAL 1 5373.3 5443.3
## - BLUEBOOK 1 5376.3 5446.3
## - MSTATUS_Yes 1 5380.3 5450.3
## - EDUCATION 4 5387.3 5451.3
## - KIDSDRIV 1 5382.8 5452.8
## - CLM_FREQ 1 5398.4 5468.4
## - TIF 1 5401.3 5471.3
## - JOB 7 5425.2 5483.2
## - TRAVTIME 1 5413.8 5483.8
## - MVR_PTS 1 5416.5 5486.5
## - REVOKED_No 1 5424.2 5494.2
## - CAR_USE 1 5424.6 5494.6
## - CAR_TYPE 5 5445.2 5507.2
## - `URBANICITY_Highly Urban/ Urban` 1 5843.0 5913.0
##
## Step: AIC=5432.76
## TARGET_FLAG ~ KIDSDRIV + AGE + YOJ + INCOME + HOME_VAL + SEX +
## EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE +
## OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes +
## RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Deviance AIC
## - YOJ 1 5363.5 5431.5
## - AGE 1 5364.0 5432.0
## <none> 5362.8 5432.8
## - SEX 1 5365.4 5433.4
## - RED_CAR_no 1 5367.6 5435.6
## - INCOME 1 5368.6 5436.6
## - OLDCLAIM 1 5371.1 5439.1
## - HOME_VAL 1 5373.8 5441.8
## - BLUEBOOK 1 5376.6 5444.6
## - PARENT1_No 1 5377.6 5445.6
## - MSTATUS_Yes 1 5380.7 5448.7
## - EDUCATION 4 5387.8 5449.8
## - KIDSDRIV 1 5390.0 5458.0
## - CLM_FREQ 1 5398.8 5466.8
## - TIF 1 5401.6 5469.6
## - JOB 7 5425.4 5481.4
## - TRAVTIME 1 5414.0 5482.0
## - MVR_PTS 1 5416.9 5484.9
## - REVOKED_No 1 5424.8 5492.8
## - CAR_USE 1 5425.1 5493.1
## - CAR_TYPE 5 5445.7 5505.7
## - `URBANICITY_Highly Urban/ Urban` 1 5843.1 5911.1
##
## Step: AIC=5431.49
## TARGET_FLAG ~ KIDSDRIV + AGE + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM +
## CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + RED_CAR_no +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Deviance AIC
## - AGE 1 5365.0 5431.0
## <none> 5363.5 5431.5
## - SEX 1 5366.1 5432.1
## - RED_CAR_no 1 5368.3 5434.3
## - INCOME 1 5369.8 5435.8
## - OLDCLAIM 1 5372.0 5438.0
## - HOME_VAL 1 5374.5 5440.5
## - BLUEBOOK 1 5377.4 5443.4
## - PARENT1_No 1 5377.9 5443.9
## - EDUCATION 4 5388.3 5448.3
## - MSTATUS_Yes 1 5382.8 5448.8
## - KIDSDRIV 1 5390.4 5456.4
## - CLM_FREQ 1 5399.6 5465.6
## - TIF 1 5402.6 5468.6
## - JOB 7 5425.5 5479.5
## - TRAVTIME 1 5414.6 5480.6
## - MVR_PTS 1 5417.9 5483.9
## - REVOKED_No 1 5425.6 5491.6
## - CAR_USE 1 5426.2 5492.2
## - CAR_TYPE 5 5446.8 5504.8
## - `URBANICITY_Highly Urban/ Urban` 1 5843.5 5909.5
##
## Step: AIC=5431
## TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + SEX + EDUCATION +
## JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE + OLDCLAIM +
## CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes + RED_CAR_no +
## REVOKED_No + `URBANICITY_Highly Urban/ Urban`
##
## Df Deviance AIC
## <none> 5365.0 5431.0
## - SEX 1 5367.1 5431.1
## - RED_CAR_no 1 5369.7 5433.7
## - INCOME 1 5371.0 5435.0
## - OLDCLAIM 1 5373.6 5437.6
## - HOME_VAL 1 5376.9 5440.9
## - BLUEBOOK 1 5380.7 5444.7
## - MSTATUS_Yes 1 5383.5 5447.5
## - PARENT1_No 1 5383.9 5447.9
## - EDUCATION 4 5390.0 5448.0
## - KIDSDRIV 1 5391.4 5455.4
## - CLM_FREQ 1 5401.0 5465.0
## - TIF 1 5404.0 5468.0
## - TRAVTIME 1 5415.8 5479.8
## - JOB 7 5428.4 5480.4
## - MVR_PTS 1 5420.1 5484.1
## - REVOKED_No 1 5427.4 5491.4
## - CAR_USE 1 5427.8 5491.8
## - CAR_TYPE 5 5447.1 5503.1
## - `URBANICITY_Highly Urban/ Urban` 1 5846.8 5910.8
summary(model_logit2)
##
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + SEX +
## EDUCATION + JOB + TRAVTIME + CAR_USE + BLUEBOOK + TIF + CAR_TYPE +
## OLDCLAIM + CLM_FREQ + MVR_PTS + PARENT1_No + MSTATUS_Yes +
## RED_CAR_no + REVOKED_No + `URBANICITY_Highly Urban/ Urban`,
## family = binomial(link = "logit"), data = insur_train_clean.d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5691 -0.7024 -0.3901 0.6201 3.1495
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.180e+00 2.635e-01 -4.477 7.58e-06 ***
## KIDSDRIV 3.307e-01 6.390e-02 5.176 2.27e-07 ***
## INCOME -3.471e-06 1.424e-06 -2.439 0.014744 *
## HOME_VAL -1.469e-06 4.267e-07 -3.443 0.000575 ***
## SEXz_F -1.861e-01 1.272e-01 -1.464 0.143274
## EDUCATIONBachelors -3.995e-01 1.231e-01 -3.245 0.001174 **
## EDUCATIONMasters -4.969e-01 1.905e-01 -2.609 0.009094 **
## EDUCATIONPhD 4.064e-02 2.461e-01 0.165 0.868809
## EDUCATIONz_High School -7.396e-03 1.063e-01 -0.070 0.944554
## JOBDoctor -9.037e-01 3.273e-01 -2.761 0.005755 **
## JOBHome Maker -2.872e-01 1.569e-01 -1.831 0.067098 .
## JOBLawyer -1.650e-01 2.112e-01 -0.781 0.434854
## JOBManager -1.091e+00 1.641e-01 -6.650 2.93e-11 ***
## JOBProfessional -2.958e-01 1.410e-01 -2.098 0.035938 *
## JOBStudent -3.133e-01 1.446e-01 -2.166 0.030305 *
## JOBz_Blue Collar -2.020e-01 1.201e-01 -1.682 0.092562 .
## TRAVTIME 1.557e-02 2.189e-03 7.114 1.13e-12 ***
## CAR_USEPrivate -8.324e-01 1.059e-01 -7.861 3.82e-15 ***
## BLUEBOOK -2.370e-05 6.031e-06 -3.930 8.51e-05 ***
## TIF -5.237e-02 8.537e-03 -6.134 8.56e-10 ***
## CAR_TYPEPanel Truck 7.121e-01 1.944e-01 3.664 0.000249 ***
## CAR_TYPEPickup 5.525e-01 1.153e-01 4.793 1.65e-06 ***
## CAR_TYPESports Car 1.090e+00 1.451e-01 7.512 5.84e-14 ***
## CAR_TYPEVan 5.778e-01 1.494e-01 3.866 0.000110 ***
## CAR_TYPEz_SUV 8.112e-01 1.247e-01 6.508 7.63e-11 ***
## OLDCLAIM -1.326e-05 4.569e-06 -2.903 0.003700 **
## CLM_FREQ 2.001e-01 3.318e-02 6.032 1.62e-09 ***
## MVR_PTS 1.170e-01 1.585e-02 7.383 1.54e-13 ***
## PARENT1_No -4.730e-01 1.089e-01 -4.342 1.41e-05 ***
## MSTATUS_Yes -4.146e-01 9.568e-02 -4.334 1.47e-05 ***
## RED_CAR_no 2.236e-01 1.031e-01 2.168 0.030141 *
## REVOKED_No -8.551e-01 1.074e-01 -7.965 1.65e-15 ***
## `URBANICITY_Highly Urban/ Urban` 2.308e+00 1.244e-01 18.552 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6990.9 on 6044 degrees of freedom
## Residual deviance: 5365.0 on 6012 degrees of freedom
## AIC: 5431
##
## Number of Fisher Scoring iterations: 5
model_probit <- (object = glm(formula = TARGET_FLAG ~ . -RED_CAR_no -PARENT1_Yes -MSTATUS_z_No -REVOKED_Yes -`URBANICITY_z_Highly Rural/ Rural` -TARGET_AMT,
data = insur_train_clean.d ,
family = binomial (link = "probit")))
summary(model_probit)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - RED_CAR_no - PARENT1_Yes - MSTATUS_z_No -
## REVOKED_Yes - `URBANICITY_z_Highly Rural/ Rural` - TARGET_AMT,
## family = binomial(link = "probit"), data = insur_train_clean.d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6127 -0.7222 -0.3921 0.6492 3.4816
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.353e-01 1.949e-01 -2.233 0.025530 *
## KIDSDRIV 1.808e-01 4.107e-02 4.403 1.07e-05 ***
## AGE -1.972e-03 2.706e-03 -0.729 0.466202
## HOMEKIDS 1.650e-02 2.474e-02 0.667 0.504891
## YOJ -5.094e-03 5.652e-03 -0.901 0.367459
## INCOME -1.921e-06 8.131e-07 -2.363 0.018148 *
## HOME_VAL -7.368e-07 2.456e-07 -3.001 0.002693 **
## SEXz_F -1.234e-01 7.334e-02 -1.682 0.092507 .
## EDUCATIONBachelors -2.183e-01 7.684e-02 -2.841 0.004493 **
## EDUCATIONMasters -2.423e-01 1.216e-01 -1.993 0.046240 *
## EDUCATIONPhD 5.496e-02 1.502e-01 0.366 0.714393
## EDUCATIONz_High School 4.212e-03 6.225e-02 0.068 0.946052
## JOBDoctor -5.337e-01 1.831e-01 -2.915 0.003560 **
## JOBHome Maker -1.900e-01 9.660e-02 -1.967 0.049160 *
## JOBLawyer -1.006e-01 1.202e-01 -0.837 0.402664
## JOBManager -6.055e-01 9.278e-02 -6.527 6.73e-11 ***
## JOBProfessional -1.614e-01 8.186e-02 -1.972 0.048613 *
## JOBStudent -1.837e-01 8.868e-02 -2.072 0.038311 *
## JOBz_Blue Collar -1.102e-01 7.016e-02 -1.571 0.116228
## TRAVTIME 9.025e-03 1.255e-03 7.194 6.31e-13 ***
## CAR_USEPrivate -4.705e-01 6.145e-02 -7.657 1.91e-14 ***
## BLUEBOOK -1.268e-05 3.477e-06 -3.647 0.000265 ***
## TIF -3.099e-02 4.880e-03 -6.350 2.16e-10 ***
## CAR_TYPEPanel Truck 3.930e-01 1.125e-01 3.492 0.000479 ***
## CAR_TYPEPickup 3.112e-01 6.596e-02 4.717 2.39e-06 ***
## CAR_TYPESports Car 6.396e-01 8.368e-02 7.644 2.11e-14 ***
## CAR_TYPEVan 3.162e-01 8.574e-02 3.688 0.000226 ***
## CAR_TYPEz_SUV 4.728e-01 7.123e-02 6.638 3.19e-11 ***
## OLDCLAIM -6.992e-06 2.670e-06 -2.619 0.008832 **
## CLM_FREQ 1.196e-01 1.954e-02 6.122 9.25e-10 ***
## MVR_PTS 6.721e-02 9.311e-03 7.219 5.25e-13 ***
## CAR_AGE -2.018e-03 5.089e-03 -0.397 0.691664
## PARENT1_No -2.345e-01 7.370e-02 -3.181 0.001466 **
## MSTATUS_Yes -2.550e-01 5.784e-02 -4.408 1.04e-05 ***
## RED_CAR_yes -1.229e-01 5.935e-02 -2.071 0.038332 *
## REVOKED_No -4.869e-01 6.261e-02 -7.776 7.46e-15 ***
## `URBANICITY_Highly Urban/ Urban` 1.264e+00 6.482e-02 19.508 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6990.9 on 6044 degrees of freedom
## Residual deviance: 5374.6 on 6008 degrees of freedom
## AIC: 5448.6
##
## Number of Fisher Scoring iterations: 5
stargazer(model_logit, model_logit2, model_probit,
type = "text")
##
## ========================================================================
## Dependent variable:
## -------------------------------------
## TARGET_FLAG
## logistic probit
## (1) (2) (3)
## ------------------------------------------------------------------------
## TARGET_AMT 0.265
## (3.433)
##
## KIDSDRIV 5.912 0.331*** 0.181***
## (1,018.797) (0.064) (0.041)
##
## AGE -0.326 -0.002
## (36.302) (0.003)
##
## HOMEKIDS -1.506 0.016
## (891.280) (0.025)
##
## YOJ -0.902 -0.005
## (120.996) (0.006)
##
## INCOME 0.00002 -0.00000** -0.00000**
## (0.023) (0.00000) (0.00000)
##
## HOME_VAL 0.00004 -0.00000*** -0.00000***
## (0.006) (0.00000) (0.00000)
##
## SEXz_F -3.347 -0.186 -0.123*
## (2,719.828) (0.127) (0.073)
##
## EDUCATIONBachelors -3.468 -0.399*** -0.218***
## (2,564.242) (0.123) (0.077)
##
## EDUCATIONMasters 14.409 -0.497*** -0.242**
## (11,936.760) (0.190) (0.122)
##
## EDUCATIONPhD 4.679 0.041 0.055
## (8,189.922) (0.246) (0.150)
##
## EDUCATIONz_High School 14.337 -0.007 0.004
## (1,665.372) (0.106) (0.062)
##
## JOBDoctor -203.858 -0.904*** -0.534***
## (297,612.500) (0.327) (0.183)
##
## JOBHome Maker -12.154 -0.287* -0.190**
## (4,125.562) (0.157) (0.097)
##
## JOBLawyer -5.558 -0.165 -0.101
## (12,118.490) (0.211) (0.120)
##
## JOBManager -0.281 -1.091*** -0.606***
## (5,761.114) (0.164) (0.093)
##
## JOBProfessional -5.576 -0.296** -0.161**
## (34,325.550) (0.141) (0.082)
##
## JOBStudent 11.360 -0.313** -0.184**
## (2,619.123) (0.145) (0.089)
##
## JOBz_Blue Collar 16.174 -0.202* -0.110
## (2,653.715) (0.120) (0.070)
##
## TRAVTIME -0.201 0.016*** 0.009***
## (29.505) (0.002) (0.001)
##
## CAR_USEPrivate -3.083 -0.832*** -0.471***
## (1,314.093) (0.106) (0.061)
##
## BLUEBOOK -0.001 -0.00002*** -0.00001***
## (0.138) (0.00001) (0.00000)
##
## TIF -1.120 -0.052*** -0.031***
## (126.129) (0.009) (0.005)
##
## CAR_TYPEPanel Truck 12.190 0.712*** 0.393***
## (34,992.480) (0.194) (0.113)
##
## CAR_TYPEPickup 13.589 0.552*** 0.311***
## (1,038.148) (0.115) (0.066)
##
## CAR_TYPESports Car 5.215 1.090*** 0.640***
## (2,975.369) (0.145) (0.084)
##
## CAR_TYPEVan -3.290 0.578*** 0.316***
## (4,274.571) (0.149) (0.086)
##
## CAR_TYPEz_SUV -14.003 0.811*** 0.473***
## (5,675.033) (0.125) (0.071)
##
## OLDCLAIM -0.001 -0.00001*** -0.00001***
## (0.069) (0.00000) (0.00000)
##
## CLM_FREQ 7.199 0.200*** 0.120***
## (417.858) (0.033) (0.020)
##
## MVR_PTS -0.203 0.117*** 0.067***
## (168.248) (0.016) (0.009)
##
## CAR_AGE 0.183 -0.002
## (152.948) (0.005)
##
## PARENT1_No 9.552 -0.473*** -0.234***
## (2,272.261) (0.109) (0.074)
##
## PARENT1_Yes
##
##
## MSTATUS_Yes -9.312 -0.415*** -0.255***
## (818.561) (0.096) (0.058)
##
## MSTATUS_z_No
##
##
## RED_CAR_no 4.338 0.224**
## (968.203) (0.103)
##
## RED_CAR_yes -0.123**
## (0.059)
##
## REVOKED_No -1.933 -0.855*** -0.487***
## (1,533.957) (0.107) (0.063)
##
## REVOKED_Yes
##
##
## `URBANICITY_Highly Urban/ Urban` 2.318 2.308*** 1.264***
## (818.160) (0.124) (0.065)
##
## `URBANICITY_z_Highly Rural/ Rural`
##
##
## Constant -33.601 -1.180*** -0.435**
## (4,380.121) (0.264) (0.195)
##
## ------------------------------------------------------------------------
## Observations 6,045 6,045 6,045
## Log Likelihood -0.00002 -2,682.500 -2,687.279
## Akaike Inf. Crit. 76.000 5,431.000 5,448.557
## ========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
exp(-.326)
## [1] 0.7218052
exp(.016)
## [1] 1.016129
exp(-0.399)
## [1] 0.6709907
exp(-14.003)
## [1] 8.290379e-07
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
summary(model_logit)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"),
## data = insur_train_clean.d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.417e-03 -2.000e-08 -2.000e-08 2.000e-08 3.016e-03
##
## Coefficients: (5 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.360e+01 4.380e+03 -0.008 0.994
## TARGET_AMT 2.650e-01 3.433e+00 0.077 0.938
## KIDSDRIV 5.912e+00 1.019e+03 0.006 0.995
## AGE -3.261e-01 3.630e+01 -0.009 0.993
## HOMEKIDS -1.506e+00 8.913e+02 -0.002 0.999
## YOJ -9.020e-01 1.210e+02 -0.007 0.994
## INCOME 1.876e-05 2.329e-02 0.001 0.999
## HOME_VAL 4.109e-05 5.654e-03 0.007 0.994
## SEXz_F -3.347e+00 2.720e+03 -0.001 0.999
## EDUCATIONBachelors -3.468e+00 2.564e+03 -0.001 0.999
## EDUCATIONMasters 1.441e+01 1.194e+04 0.001 0.999
## EDUCATIONPhD 4.679e+00 8.190e+03 0.001 1.000
## EDUCATIONz_High School 1.434e+01 1.665e+03 0.009 0.993
## JOBDoctor -2.039e+02 2.976e+05 -0.001 0.999
## JOBHome Maker -1.215e+01 4.126e+03 -0.003 0.998
## JOBLawyer -5.558e+00 1.212e+04 0.000 1.000
## JOBManager -2.807e-01 5.761e+03 0.000 1.000
## JOBProfessional -5.576e+00 3.433e+04 0.000 1.000
## JOBStudent 1.136e+01 2.619e+03 0.004 0.997
## JOBz_Blue Collar 1.617e+01 2.654e+03 0.006 0.995
## TRAVTIME -2.008e-01 2.951e+01 -0.007 0.995
## CAR_USEPrivate -3.083e+00 1.314e+03 -0.002 0.998
## BLUEBOOK -1.111e-03 1.378e-01 -0.008 0.994
## TIF -1.120e+00 1.261e+02 -0.009 0.993
## CAR_TYPEPanel Truck 1.219e+01 3.499e+04 0.000 1.000
## CAR_TYPEPickup 1.359e+01 1.038e+03 0.013 0.990
## CAR_TYPESports Car 5.215e+00 2.975e+03 0.002 0.999
## CAR_TYPEVan -3.290e+00 4.275e+03 -0.001 0.999
## CAR_TYPEz_SUV -1.400e+01 5.675e+03 -0.002 0.998
## OLDCLAIM -7.916e-04 6.893e-02 -0.011 0.991
## CLM_FREQ 7.199e+00 4.179e+02 0.017 0.986
## MVR_PTS -2.031e-01 1.682e+02 -0.001 0.999
## CAR_AGE 1.834e-01 1.529e+02 0.001 0.999
## PARENT1_No 9.552e+00 2.272e+03 0.004 0.997
## PARENT1_Yes NA NA NA NA
## MSTATUS_Yes -9.312e+00 8.186e+02 -0.011 0.991
## MSTATUS_z_No NA NA NA NA
## RED_CAR_no 4.338e+00 9.682e+02 0.004 0.996
## RED_CAR_yes NA NA NA NA
## REVOKED_No -1.933e+00 1.534e+03 -0.001 0.999
## REVOKED_Yes NA NA NA NA
## `URBANICITY_Highly Urban/ Urban` 2.318e+00 8.182e+02 0.003 0.998
## `URBANICITY_z_Highly Rural/ Rural` NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6.9909e+03 on 6044 degrees of freedom
## Residual deviance: 4.5966e-05 on 6007 degrees of freedom
## AIC: 76
##
## Number of Fisher Scoring iterations: 25
predicted <- predict(model_logit, data = insur_train_clean.d, type="response")
insur_train_clean.d$predicted <- predict(model_logit, data = insur_train_clean.d, type="response")
predicted_binary <- ifelse(test = insur_train_clean.d$predicted>.5,yes = 1,no = 0)
insur_train_clean.d$predicted_binary <- ifelse(test = insur_train_clean.d$predicted>.5,yes = 1,no = 0)
table(insur_train_clean.d$predicted_binary)
##
## 0 1
## 4443 1602
table(insur_train_clean.d$TARGET_FLAG)
##
## 0 1
## 4443 1602
table(insur_train_clean.d$TARGET_FLAG, insur_train_clean.d$predicted_binary)
##
## 0 1
## 0 4443 0
## 1 0 1602
confusionMatrix(reference = factor(insur_train_clean.d$TARGET_FLAG),
data = factor(insur_train_clean.d$predicted_binary),
positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4443 0
## 1 0 1602
##
## Accuracy : 1
## 95% CI : (0.9994, 1)
## No Information Rate : 0.735
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.265
## Detection Rate : 0.265
## Detection Prevalence : 0.265
## Balanced Accuracy : 1.000
##
## 'Positive' Class : 1
##
In terms of the classification error rate (0), accuracy(1), sensitivity(1) and specificity(1), this is a perfect fitting model. However, this could be for a couple of different reasons. This model could be over fitting to the actual data set. In addition, a confusion matrix is usually done with the test data set. This confusion matrix was done with the training data set, and there were no errors.
insurance_predictions <- insur_eval_clean.d %>%
mutate(predictions = predict(model_logit, newdata= insur_eval_clean.d ))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `predictions = predict(model_logit, newdata =
## insur_eval_clean.d)`.
## Caused by warning in `predict.lm()`:
## ! prediction from a rank-deficient fit may be misleading
head(insurance_predictions)
## TARGET_FLAG TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL SEX
## 1 0 0 0 60 0 11 67349 0 M
## 2 0 0 0 43 0 11 91449 257252 M
## 3 0 0 0 35 1 10 16039 124191 z_F
## 4 1 2946 0 34 1 12 125301 0 z_F
## 5 1 2501 0 34 0 10 62978 0 z_F
## 6 0 0 0 50 0 7 106952 0 M
## EDUCATION JOB TRAVTIME CAR_USE BLUEBOOK TIF CAR_TYPE
## 1 PhD Professional 14 Private 14230 11 Minivan
## 2 z_High School z_Blue Collar 22 Commercial 14940 1 Minivan
## 3 z_High School Clerical 5 Private 4010 4 z_SUV
## 4 Bachelors z_Blue Collar 46 Commercial 17430 1 Sports Car
## 5 Bachelors Clerical 34 Private 11200 1 z_SUV
## 6 Bachelors Professional 48 Commercial 18510 7 Van
## OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE PARENT1_No PARENT1_Yes MSTATUS_Yes
## 1 4461 2 3 18 1 0 0
## 2 0 0 0 1 1 0 0
## 3 38690 2 3 10 1 0 1
## 4 0 0 0 7 0 1 0
## 5 0 0 0 1 1 0 0
## 6 0 0 1 17 1 0 0
## MSTATUS_z_No RED_CAR_no RED_CAR_yes REVOKED_No REVOKED_Yes
## 1 1 0 1 1 0
## 2 1 0 1 1 0
## 3 0 1 0 1 0
## 4 1 1 0 1 0
## 5 1 1 0 1 0
## 6 1 1 0 1 0
## URBANICITY_Highly Urban/ Urban URBANICITY_z_Highly Rural/ Rural predictions
## 1 1 0 -73.25617
## 2 1 0 -26.76305
## 3 1 0 -76.21662
## 4 1 0 716.75139
## 5 1 0 580.29333
## 6 0 1 -89.72293
This model predicted that those who got in a crash would be fairly younger than those who didn’t. But neither of them had motor vehical points and both had a bachelors degree which I am surprised about.