# Load the training and evaluation data sets from CSV files.
# NOTE(review): both paths are absolute and machine-specific, so the script
# only runs unchanged where these files exist at exactly these locations.
df_train <- read.csv("/Users/bellajean/Downloads/insurance_training_data.csv")
df_eval <- read.csv("/Users/bellajean/Downloads/insurance-evaluation-data.csv")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
?stargazer
summary(df_train)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV
## Min. : 1 Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.: 2559 1st Qu.:0.0000 1st Qu.: 0 1st Qu.:0.0000
## Median : 5133 Median :0.0000 Median : 0 Median :0.0000
## Mean : 5152 Mean :0.2638 Mean : 1504 Mean :0.1711
## 3rd Qu.: 7745 3rd Qu.:1.0000 3rd Qu.: 1036 3rd Qu.:0.0000
## Max. :10302 Max. :1.0000 Max. :107586 Max. :4.0000
##
## AGE HOMEKIDS YOJ INCOME
## Min. :16.00 Min. :0.0000 Min. : 0.0 Length:8161
## 1st Qu.:39.00 1st Qu.:0.0000 1st Qu.: 9.0 Class :character
## Median :45.00 Median :0.0000 Median :11.0 Mode :character
## Mean :44.79 Mean :0.7212 Mean :10.5
## 3rd Qu.:51.00 3rd Qu.:1.0000 3rd Qu.:13.0
## Max. :81.00 Max. :5.0000 Max. :23.0
## NA's :6 NA's :454
## PARENT1 HOME_VAL MSTATUS SEX
## Length:8161 Length:8161 Length:8161 Length:8161
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## EDUCATION JOB TRAVTIME CAR_USE
## Length:8161 Length:8161 Min. : 5.00 Length:8161
## Class :character Class :character 1st Qu.: 22.00 Class :character
## Mode :character Mode :character Median : 33.00 Mode :character
## Mean : 33.49
## 3rd Qu.: 44.00
## Max. :142.00
##
## BLUEBOOK TIF CAR_TYPE RED_CAR
## Length:8161 Min. : 1.000 Length:8161 Length:8161
## Class :character 1st Qu.: 1.000 Class :character Class :character
## Mode :character Median : 4.000 Mode :character Mode :character
## Mean : 5.351
## 3rd Qu.: 7.000
## Max. :25.000
##
## OLDCLAIM CLM_FREQ REVOKED MVR_PTS
## Length:8161 Min. :0.0000 Length:8161 Min. : 0.000
## Class :character 1st Qu.:0.0000 Class :character 1st Qu.: 0.000
## Mode :character Median :0.0000 Mode :character Median : 1.000
## Mean :0.7986 Mean : 1.696
## 3rd Qu.:2.0000 3rd Qu.: 3.000
## Max. :5.0000 Max. :13.000
##
## CAR_AGE URBANICITY
## Min. :-3.000 Length:8161
## 1st Qu.: 1.000 Class :character
## Median : 8.000 Mode :character
## Mean : 8.328
## 3rd Qu.:12.000
## Max. :28.000
## NA's :510
summary(df_eval)
## INDEX TARGET_FLAG TARGET_AMT KIDSDRIV AGE
## Min. : 3 Mode:logical Mode:logical Min. :0.0000 Min. :17.00
## 1st Qu.: 2632 NA's:2141 NA's:2141 1st Qu.:0.0000 1st Qu.:39.00
## Median : 5224 Median :0.0000 Median :45.00
## Mean : 5150 Mean :0.1625 Mean :45.02
## 3rd Qu.: 7669 3rd Qu.:0.0000 3rd Qu.:51.00
## Max. :10300 Max. :3.0000 Max. :73.00
## NA's :1
## HOMEKIDS YOJ INCOME PARENT1
## Min. :0.0000 Min. : 0.00 Length:2141 Length:2141
## 1st Qu.:0.0000 1st Qu.: 9.00 Class :character Class :character
## Median :0.0000 Median :11.00 Mode :character Mode :character
## Mean :0.7174 Mean :10.38
## 3rd Qu.:1.0000 3rd Qu.:13.00
## Max. :5.0000 Max. :19.00
## NA's :94
## HOME_VAL MSTATUS SEX EDUCATION
## Length:2141 Length:2141 Length:2141 Length:2141
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## JOB TRAVTIME CAR_USE BLUEBOOK
## Length:2141 Min. : 5.00 Length:2141 Length:2141
## Class :character 1st Qu.: 22.00 Class :character Class :character
## Mode :character Median : 33.00 Mode :character Mode :character
## Mean : 33.15
## 3rd Qu.: 43.00
## Max. :105.00
##
## TIF CAR_TYPE RED_CAR OLDCLAIM
## Min. : 1.000 Length:2141 Length:2141 Length:2141
## 1st Qu.: 1.000 Class :character Class :character Class :character
## Median : 4.000 Mode :character Mode :character Mode :character
## Mean : 5.245
## 3rd Qu.: 7.000
## Max. :25.000
##
## CLM_FREQ REVOKED MVR_PTS CAR_AGE
## Min. :0.000 Length:2141 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.000 Class :character 1st Qu.: 0.000 1st Qu.: 1.000
## Median :0.000 Mode :character Median : 1.000 Median : 8.000
## Mean :0.809 Mean : 1.766 Mean : 8.183
## 3rd Qu.:2.000 3rd Qu.: 3.000 3rd Qu.:12.000
## Max. :5.000 Max. :12.000 Max. :26.000
## NA's :129
## URBANICITY
## Length:2141
## Class :character
## Mode :character
##
##
##
##
There are 8161 observations of 26 variables in the training data set, and 2141 observations of 26 variables in the evaluation data set.
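To remove the $ and , signs from the numerical (currency) data and to replace spaces with underscores in the categorical data, we define two helper functions (the z_ prefixes in the categorical data are left unchanged):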
# Convert currency-formatted strings (e.g. "$1,234,567") to numeric.
#
# input: character vector of amounts that may contain "$" and "," characters.
# Returns a numeric vector of the same length; empty strings and NA become NA.
currencyconv <- function(input) {
  # gsub() strips every "$" and ",", so amounts with more than one thousands
  # separator are converted instead of silently turning into NA.
  as.numeric(gsub("[$,]", "", input))
}
# Replace every space in each element of `input` with an underscore.
#
# input: character vector (e.g. category labels).
# Returns a character vector of the same length.
underscore <- function(input) {
  # gsub() handles labels with several spaces; sub() would only replace the
  # first one and leave e.g. "Highly_Urban/ Urban" half-converted.
  gsub(" ", "_", input, fixed = TRUE)
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
?as.tbl
# Clean the training data:
#   * parse the currency-formatted columns into numeric values,
#   * replace spaces in the multi-word categorical labels and store those
#     columns as factors,
#   * store the binary target as a factor.
# tibble::as_tibble() replaces the deprecated dplyr::as.tbl(), and across()
# replaces the superseded mutate_at().
df_train <- tibble::as_tibble(df_train) %>%
  mutate(
    across(c(INCOME, HOME_VAL, BLUEBOOK, OLDCLAIM), currencyconv),
    across(
      c(EDUCATION, JOB, CAR_TYPE, URBANICITY),
      function(x) as.factor(underscore(x))
    ),
    TARGET_FLAG = as.factor(TARGET_FLAG)
  )
## Warning: `as.tbl()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::as_tibble()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df_train
## # A tibble: 8,161 × 26
## INDEX TARGET_FLAG TARGET…¹ KIDSD…² AGE HOMEK…³ YOJ INCOME PARENT1 HOME_…⁴
## <int> <fct> <dbl> <int> <int> <int> <int> <dbl> <chr> <dbl>
## 1 1 0 0 0 60 0 11 67349 No 0
## 2 2 0 0 0 43 0 11 91449 No 257252
## 3 4 0 0 0 35 1 10 16039 No 124191
## 4 5 0 0 0 51 0 14 NA No 306251
## 5 6 0 0 0 50 0 NA 114986 No 243925
## 6 7 1 2946 0 34 1 12 125301 Yes 0
## 7 8 0 0 0 54 0 NA 18755 No NA
## 8 11 1 4021 1 37 2 NA 107961 No 333680
## 9 12 1 2501 0 34 0 10 62978 No 0
## 10 13 0 0 0 50 0 7 106952 No 0
## # … with 8,151 more rows, 16 more variables: MSTATUS <chr>, SEX <chr>,
## # EDUCATION <fct>, JOB <fct>, TRAVTIME <int>, CAR_USE <chr>, BLUEBOOK <dbl>,
## # TIF <int>, CAR_TYPE <fct>, RED_CAR <chr>, OLDCLAIM <dbl>, CLM_FREQ <int>,
## # REVOKED <chr>, MVR_PTS <int>, CAR_AGE <int>, URBANICITY <fct>, and
## # abbreviated variable names ¹TARGET_AMT, ²KIDSDRIV, ³HOMEKIDS, ⁴HOME_VAL
library(knitr)
knitr::kable
## function (x, format, digits = getOption("digits"), row.names = NA,
## col.names = NA, align, caption = NULL, label = NULL, format.args = list(),
## escape = TRUE, ...)
## {
## format = kable_format(format)
## if (!missing(align) && length(align) == 1L && !grepl("[^lcr]",
## align))
## align = strsplit(align, "")[[1]]
## if (inherits(x, "list")) {
## format = kable_format_latex(format)
## res = lapply(x, kable, format = format, digits = digits,
## row.names = row.names, col.names = col.names, align = align,
## caption = NA, format.args = format.args, escape = escape,
## ...)
## return(kables(res, format, caption, label))
## }
## caption = kable_caption(label, caption, format)
## if (!is.matrix(x))
## x = as.data.frame(x)
## if (identical(col.names, NA))
## col.names = colnames(x)
## m = ncol(x)
## isn = if (is.matrix(x))
## rep(is.numeric(x), m)
## else sapply(x, is.numeric)
## if (missing(align) || (format == "latex" && is.null(align)))
## align = ifelse(isn, "r", "l")
## digits = rep(digits, length.out = m)
## for (j in seq_len(m)) {
## if (is_numeric(x[, j]))
## x[, j] = round(x[, j], digits[j])
## }
## if (any(isn)) {
## if (is.matrix(x)) {
## if (is.table(x) && length(dim(x)) == 2)
## class(x) = "matrix"
## x = format_matrix(x, format.args)
## }
## else x[, isn] = format_args(x[, isn], format.args)
## }
## if (is.na(row.names))
## row.names = has_rownames(x)
## if (!is.null(align))
## align = rep(align, length.out = m)
## if (row.names) {
## x = cbind(` ` = rownames(x), x)
## if (!is.null(col.names))
## col.names = c(" ", col.names)
## if (!is.null(align))
## align = c("l", align)
## }
## n = nrow(x)
## x = replace_na(to_character(x), is.na(x))
## if (!is.matrix(x))
## x = matrix(x, nrow = n)
## x = trimws(x)
## colnames(x) = col.names
## if (format != "latex" && length(align) && !all(align %in%
## c("l", "r", "c")))
## stop("'align' must be a character vector of possible values 'l', 'r', and 'c'")
## attr(x, "align") = align
## if (format == "simple" && nrow(x) == 0)
## format = "pipe"
## res = do.call(paste("kable", format, sep = "_"), list(x = x,
## caption = caption, escape = escape, ...))
## structure(res, format = format, class = "knitr_kable")
## }
## <bytecode: 0x7f9a68f62320>
## <environment: namespace:knitr>
# Install kableExtra from GitHub only when it is not already available, so
# that re-running the script does not contact the network on every execution.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  remotes::install_github("haozhu233/kableExtra")
}
## Skipping install of 'kableExtra' from a github remote, the SHA1 (292f6071) has not changed since last install.
## Use `force = TRUE` to force installation
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
summary(df_train) %>% kable() %>% kable_styling()
| INDEX | TARGET_FLAG | TARGET_AMT | KIDSDRIV | AGE | HOMEKIDS | YOJ | INCOME | PARENT1 | HOME_VAL | MSTATUS | SEX | EDUCATION | JOB | TRAVTIME | CAR_USE | BLUEBOOK | TIF | CAR_TYPE | RED_CAR | OLDCLAIM | CLM_FREQ | REVOKED | MVR_PTS | CAR_AGE | URBANICITY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 1 | 0:6008 | Min. : 0 | Min. :0.0000 | Min. :16.00 | Min. :0.0000 | Min. : 0.0 | Min. : 0 | Length:8161 | Min. : 0 | Length:8161 | Length:8161 | <High_School :1203 | z_Blue_Collar:1825 | Min. : 5.00 | Length:8161 | Min. : 1500 | Min. : 1.000 | Minivan :2145 | Length:8161 | Min. : 0 | Min. :0.0000 | Length:8161 | Min. : 0.000 | Min. :-3.000 | Highly_Urban/ Urban :6492 | |
| 1st Qu.: 2559 | 1:2153 | 1st Qu.: 0 | 1st Qu.:0.0000 | 1st Qu.:39.00 | 1st Qu.:0.0000 | 1st Qu.: 9.0 | 1st Qu.: 28097 | Class :character | 1st Qu.: 0 | Class :character | Class :character | Bachelors :2242 | Clerical :1271 | 1st Qu.: 22.00 | Class :character | 1st Qu.: 9280 | 1st Qu.: 1.000 | Panel_Truck: 676 | Class :character | 1st Qu.: 0 | 1st Qu.:0.0000 | Class :character | 1st Qu.: 0.000 | 1st Qu.: 1.000 | z_Highly_Rural/ Rural:1669 | |
| Median : 5133 | NA | Median : 0 | Median :0.0000 | Median :45.00 | Median :0.0000 | Median :11.0 | Median : 54028 | Mode :character | Median :161160 | Mode :character | Mode :character | Masters :1658 | Professional :1117 | Median : 33.00 | Mode :character | Median :14440 | Median : 4.000 | Pickup :1389 | Mode :character | Median : 0 | Median :0.0000 | Mode :character | Median : 1.000 | Median : 8.000 | NA | |
| Mean : 5152 | NA | Mean : 1504 | Mean :0.1711 | Mean :44.79 | Mean :0.7212 | Mean :10.5 | Mean : 61898 | NA | Mean :154867 | NA | NA | PhD : 728 | Manager : 988 | Mean : 33.49 | NA | Mean :15710 | Mean : 5.351 | Sports_Car : 907 | NA | Mean : 4037 | Mean :0.7986 | NA | Mean : 1.696 | Mean : 8.328 | NA | |
| 3rd Qu.: 7745 | NA | 3rd Qu.: 1036 | 3rd Qu.:0.0000 | 3rd Qu.:51.00 | 3rd Qu.:1.0000 | 3rd Qu.:13.0 | 3rd Qu.: 85986 | NA | 3rd Qu.:238724 | NA | NA | z_High_School:2330 | Lawyer : 835 | 3rd Qu.: 44.00 | NA | 3rd Qu.:20850 | 3rd Qu.: 7.000 | Van : 750 | NA | 3rd Qu.: 4636 | 3rd Qu.:2.0000 | NA | 3rd Qu.: 3.000 | 3rd Qu.:12.000 | NA | |
| Max. :10302 | NA | Max. :107586 | Max. :4.0000 | Max. :81.00 | Max. :5.0000 | Max. :23.0 | Max. :367030 | NA | Max. :885282 | NA | NA | NA | Student : 712 | Max. :142.00 | NA | Max. :69740 | Max. :25.000 | z_SUV :2294 | NA | Max. :57037 | Max. :5.0000 | NA | Max. :13.000 | Max. :28.000 | NA | |
| NA | NA | NA | NA | NA’s :6 | NA | NA’s :454 | NA’s :445 | NA | NA’s :464 | NA | NA | NA | (Other) :1413 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :510 | NA |
# Number of missing values in each column, rendered as a styled table.
# vapply() pins the result to one integer per column, unlike sapply().
vapply(df_train, function(x) sum(is.na(x)), integer(1)) %>%
  kable() %>%
  kable_styling()
| x | |
|---|---|
| INDEX | 0 |
| TARGET_FLAG | 0 |
| TARGET_AMT | 0 |
| KIDSDRIV | 0 |
| AGE | 6 |
| HOMEKIDS | 0 |
| YOJ | 454 |
| INCOME | 445 |
| PARENT1 | 0 |
| HOME_VAL | 464 |
| MSTATUS | 0 |
| SEX | 0 |
| EDUCATION | 0 |
| JOB | 0 |
| TRAVTIME | 0 |
| CAR_USE | 0 |
| BLUEBOOK | 0 |
| TIF | 0 |
| CAR_TYPE | 0 |
| RED_CAR | 0 |
| OLDCLAIM | 0 |
| CLM_FREQ | 0 |
| REVOKED | 0 |
| MVR_PTS | 0 |
| CAR_AGE | 510 |
| URBANICITY | 0 |
# Visualization
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag() masks stats::lag()
?ggplot
# Density plot of every numeric column, one free-scaled panel per variable.
# select(where()) and pivot_longer() replace the superseded select_if() and
# gather(); the extra keep(is.numeric) was redundant because ntrain already
# holds only numeric columns.
ntrain <- dplyr::select(df_train, where(is.numeric))
ntrain %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_density()
## Warning: Removed 1879 rows containing non-finite values (`stat_density()`).
# Mean-impute the five numeric columns that contain missing values.
df_train[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")] <- lapply(
  df_train[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")],
  function(v) {
    v[is.na(v)] <- mean(v, na.rm = TRUE)
    v
  }
)
# Keep only rows without any remaining missing value.
df_train <- df_train[complete.cases(df_train), ]
# Snapshot of the cleaned data taken before INDEX is dropped.
df_train2 <- df_train
# Drop the row identifier so it is not used as a predictor.
df_train <- df_train[, colnames(df_train) != "INDEX"]
# df_train$new <- df_train$tax / (df_train$medv*10)
# Numeric columns only, for the correlation analysis.
trainnum <- df_train[vapply(df_train, is.numeric, logical(1))]
library("Hmisc")
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(trainnum))
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL TRAVTIME
## TARGET_AMT 1.00 0.06 -0.04 0.06 -0.02 -0.06 -0.08 0.03
## KIDSDRIV 0.06 1.00 -0.08 0.46 0.04 -0.05 -0.02 0.01
## AGE -0.04 -0.08 1.00 -0.45 0.13 0.18 0.20 0.01
## HOMEKIDS 0.06 0.46 -0.45 1.00 0.08 -0.16 -0.11 -0.01
## YOJ -0.02 0.04 0.13 0.08 1.00 0.27 0.26 -0.02
## INCOME -0.06 -0.05 0.18 -0.16 0.27 1.00 0.54 -0.05
## HOME_VAL -0.08 -0.02 0.20 -0.11 0.26 0.54 1.00 -0.03
## TRAVTIME 0.03 0.01 0.01 -0.01 -0.02 -0.05 -0.03 1.00
## BLUEBOOK 0.00 -0.02 0.16 -0.11 0.14 0.42 0.25 -0.02
## TIF -0.05 0.00 0.00 0.01 0.02 0.00 0.00 -0.01
## OLDCLAIM 0.07 0.02 -0.03 0.03 0.00 -0.04 -0.07 -0.02
## CLM_FREQ 0.12 0.04 -0.02 0.03 -0.03 -0.05 -0.09 0.01
## MVR_PTS 0.14 0.05 -0.07 0.06 -0.04 -0.06 -0.08 0.01
## CAR_AGE -0.06 -0.05 0.17 -0.15 0.06 0.39 0.20 -0.04
## BLUEBOOK TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.00 -0.05 0.07 0.12 0.14 -0.06
## KIDSDRIV -0.02 0.00 0.02 0.04 0.05 -0.05
## AGE 0.16 0.00 -0.03 -0.02 -0.07 0.17
## HOMEKIDS -0.11 0.01 0.03 0.03 0.06 -0.15
## YOJ 0.14 0.02 0.00 -0.03 -0.04 0.06
## INCOME 0.42 0.00 -0.04 -0.05 -0.06 0.39
## HOME_VAL 0.25 0.00 -0.07 -0.09 -0.08 0.20
## TRAVTIME -0.02 -0.01 -0.02 0.01 0.01 -0.04
## BLUEBOOK 1.00 -0.01 -0.03 -0.04 -0.04 0.18
## TIF -0.01 1.00 -0.02 -0.02 -0.04 0.01
## OLDCLAIM -0.03 -0.02 1.00 0.50 0.26 -0.01
## CLM_FREQ -0.04 -0.02 0.50 1.00 0.40 -0.01
## MVR_PTS -0.04 -0.04 0.26 0.40 1.00 -0.02
## CAR_AGE 0.18 0.01 -0.01 -0.01 -0.02 1.00
##
## n= 8161
##
##
## P
## TARGET_AMT KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL TRAVTIME
## TARGET_AMT 0.0000 0.0002 0.0000 0.0585 0.0000 0.0000 0.0115
## KIDSDRIV 0.0000 0.0000 0.0000 0.0002 0.0000 0.0803 0.4455
## AGE 0.0002 0.0000 0.0000 0.0000 0.0000 0.0000 0.6342
## HOMEKIDS 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.5128
## YOJ 0.0585 0.0002 0.0000 0.0000 0.0000 0.0000 0.1362
## INCOME 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## HOME_VAL 0.0000 0.0803 0.0000 0.0000 0.0000 0.0000 0.0018
## TRAVTIME 0.0115 0.4455 0.6342 0.5128 0.1362 0.0000 0.0018
## BLUEBOOK 0.6712 0.0516 0.0000 0.0000 0.0000 0.0000 0.0000 0.1246
## TIF 0.0000 0.8574 0.9952 0.2859 0.0294 0.9274 0.8569 0.2945
## OLDCLAIM 0.0000 0.0653 0.0082 0.0069 0.7931 0.0000 0.0000 0.0818
## CLM_FREQ 0.0000 0.0008 0.0296 0.0080 0.0210 0.0000 0.0000 0.5535
## MVR_PTS 0.0000 0.0000 0.0000 0.0000 0.0009 0.0000 0.0000 0.3384
## CAR_AGE 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0009
## BLUEBOOK TIF OLDCLAIM CLM_FREQ MVR_PTS CAR_AGE
## TARGET_AMT 0.6712 0.0000 0.0000 0.0000 0.0000 0.0000
## KIDSDRIV 0.0516 0.8574 0.0653 0.0008 0.0000 0.0000
## AGE 0.0000 0.9952 0.0082 0.0296 0.0000 0.0000
## HOMEKIDS 0.0000 0.2859 0.0069 0.0080 0.0000 0.0000
## YOJ 0.0000 0.0294 0.7931 0.0210 0.0009 0.0000
## INCOME 0.0000 0.9274 0.0000 0.0000 0.0000 0.0000
## HOME_VAL 0.0000 0.8569 0.0000 0.0000 0.0000 0.0000
## TRAVTIME 0.1246 0.2945 0.0818 0.5535 0.3384 0.0009
## BLUEBOOK 0.6242 0.0077 0.0010 0.0004 0.0000
## TIF 0.6242 0.0473 0.0375 0.0002 0.4971
## OLDCLAIM 0.0077 0.0473 0.0000 0.0000 0.2402
## CLM_FREQ 0.0010 0.0375 0.0000 0.0000 0.4151
## MVR_PTS 0.0004 0.0002 0.0000 0.0000 0.0816
## CAR_AGE 0.0000 0.4971 0.2402 0.4151 0.0816
library("corrplot")
## corrplot 0.92 loaded
corrplot(cor(trainnum), method="square")
cor.test(trainnum$HOMEKIDS,trainnum$AGE,method="pearson")
##
## Pearson's product-moment correlation
##
## data: trainnum$HOMEKIDS and trainnum$AGE
## t = -44.897, df = 8159, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4623262 -0.4275266
## sample estimates:
## cor
## -0.4450944
df_train2 <- df_train
# MODEL 1
# Logistic regression of TARGET_FLAG on every other column of df_train
# except TARGET_AMT.
logit <- glm(
  formula = TARGET_FLAG ~ . - TARGET_AMT,
  family = binomial(link = "logit"),
  data = df_train
)
summary(logit)
##
## Call:
## glm(formula = TARGET_FLAG ~ . - TARGET_AMT, family = binomial(link = "logit"),
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5850 -0.7129 -0.3982 0.6260 3.1521
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.262e-01 3.215e-01 -2.881 0.003961 **
## KIDSDRIV 3.862e-01 6.122e-02 6.307 2.84e-10 ***
## AGE -9.998e-04 4.020e-03 -0.249 0.803572
## HOMEKIDS 4.974e-02 3.713e-02 1.340 0.180395
## YOJ -1.119e-02 8.591e-03 -1.302 0.192813
## INCOME -3.423e-06 1.079e-06 -3.172 0.001514 **
## PARENT1Yes 3.820e-01 1.096e-01 3.485 0.000492 ***
## HOME_VAL -1.305e-06 3.423e-07 -3.812 0.000138 ***
## MSTATUSz_No 4.939e-01 8.357e-02 5.910 3.43e-09 ***
## SEXz_F -8.224e-02 1.120e-01 -0.734 0.462910
## EDUCATIONBachelors -3.803e-01 1.156e-01 -3.288 0.001009 **
## EDUCATIONMasters -2.898e-01 1.787e-01 -1.621 0.104941
## EDUCATIONPhD -1.664e-01 2.139e-01 -0.778 0.436721
## EDUCATIONz_High_School 1.782e-02 9.506e-02 0.187 0.851300
## JOBClerical 4.105e-01 1.966e-01 2.087 0.036863 *
## JOBDoctor -4.469e-01 2.671e-01 -1.673 0.094284 .
## JOBHome_Maker 2.315e-01 2.101e-01 1.102 0.270566
## JOBLawyer 1.049e-01 1.695e-01 0.619 0.535737
## JOBManager -5.576e-01 1.716e-01 -3.250 0.001153 **
## JOBProfessional 1.614e-01 1.784e-01 0.904 0.365739
## JOBStudent 2.155e-01 2.145e-01 1.005 0.315098
## JOBz_Blue_Collar 3.100e-01 1.856e-01 1.671 0.094786 .
## TRAVTIME 1.457e-02 1.883e-03 7.737 1.01e-14 ***
## CAR_USEPrivate -7.564e-01 9.172e-02 -8.247 < 2e-16 ***
## BLUEBOOK -2.085e-05 5.262e-06 -3.963 7.40e-05 ***
## TIF -5.546e-02 7.344e-03 -7.552 4.29e-14 ***
## CAR_TYPEPanel_Truck 5.607e-01 1.618e-01 3.466 0.000528 ***
## CAR_TYPEPickup 5.537e-01 1.007e-01 5.497 3.85e-08 ***
## CAR_TYPESports_Car 1.025e+00 1.299e-01 7.890 3.02e-15 ***
## CAR_TYPEVan 6.186e-01 1.265e-01 4.891 1.00e-06 ***
## CAR_TYPEz_SUV 7.681e-01 1.113e-01 6.903 5.09e-12 ***
## RED_CARyes -9.537e-03 8.636e-02 -0.110 0.912067
## OLDCLAIM -1.388e-05 3.910e-06 -3.551 0.000384 ***
## CLM_FREQ 1.960e-01 2.854e-02 6.865 6.65e-12 ***
## REVOKEDYes 8.872e-01 9.133e-02 9.714 < 2e-16 ***
## MVR_PTS 1.132e-01 1.361e-02 8.314 < 2e-16 ***
## CAR_AGE -8.599e-04 7.541e-03 -0.114 0.909213
## URBANICITYz_Highly_Rural/ Rural -2.390e+00 1.128e-01 -21.180 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7297.5 on 8123 degrees of freedom
## AIC: 7373.5
##
## Number of Fisher Scoring iterations: 5
exp(logit$coefficients)
## (Intercept) KIDSDRIV
## 0.39603775 1.47133252
## AGE HOMEKIDS
## 0.99900069 1.05099559
## YOJ INCOME
## 0.98887451 0.99999658
## PARENT1Yes HOME_VAL
## 1.46522755 0.99999869
## MSTATUSz_No SEXz_F
## 1.63869016 0.92105238
## EDUCATIONBachelors EDUCATIONMasters
## 0.68368429 0.74844699
## EDUCATIONPhD EDUCATIONz_High_School
## 0.84674479 1.01797922
## JOBClerical JOBDoctor
## 1.50750416 0.63959396
## JOBHome_Maker JOBLawyer
## 1.26049838 1.11065166
## JOBManager JOBProfessional
## 0.57257892 1.17514024
## JOBStudent JOBz_Blue_Collar
## 1.24042545 1.36344323
## TRAVTIME CAR_USEPrivate
## 1.01467770 0.46934226
## BLUEBOOK TIF
## 0.99997915 0.94605169
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 1.75190925 1.73975757
## CAR_TYPESports_Car CAR_TYPEVan
## 2.78640582 1.85637992
## CAR_TYPEz_SUV RED_CARyes
## 2.15562973 0.99050868
## OLDCLAIM CLM_FREQ
## 0.99998612 1.21647700
## REVOKEDYes MVR_PTS
## 2.42833966 1.11982364
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## 0.99914047 0.09165049
# Scale factor: mean of the logistic density evaluated at each observation's
# fitted linear predictor (predict(..., type = "link")).
logitscalar <- mean(dlogis(predict(logit, type = "link")))
# Model 1 coefficients multiplied by that factor (presumably intended as
# approximate average marginal effects on the probability scale; TODO confirm).
logitscalar * coef(logit)
## (Intercept) KIDSDRIV
## -1.347373e-01 5.617440e-02
## AGE HOMEKIDS
## -1.454380e-04 7.235175e-03
## YOJ INCOME
## -1.627452e-03 -4.979343e-07
## PARENT1Yes HOME_VAL
## 5.556956e-02 -1.898521e-07
## MSTATUSz_No SEXz_F
## 7.184528e-02 -1.196289e-02
## EDUCATIONBachelors EDUCATIONMasters
## -5.531478e-02 -4.214950e-02
## EDUCATIONPhD EDUCATIONz_High_School
## -2.419914e-02 2.592133e-03
## JOBClerical JOBDoctor
## 5.970732e-02 -6.501194e-02
## JOBHome_Maker JOBLawyer
## 3.367643e-02 1.526621e-02
## JOBManager JOBProfessional
## -8.111255e-02 2.347640e-02
## JOBStudent JOBz_Blue_Collar
## 3.134130e-02 4.509641e-02
## TRAVTIME CAR_USEPrivate
## 2.119590e-03 -1.100339e-01
## BLUEBOOK TIF
## -3.033364e-06 -8.067267e-03
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 8.156371e-02 8.055121e-02
## CAR_TYPESports_Car CAR_TYPEVan
## 1.490667e-01 8.998941e-02
## CAR_TYPEz_SUV RED_CARyes
## 1.117300e-01 -1.387259e-03
## OLDCLAIM CLM_FREQ
## -2.019267e-06 2.850538e-02
## REVOKEDYes MVR_PTS
## 1.290586e-01 1.646257e-02
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## -1.250859e-04 -3.476308e-01
confint.default(logit)
## 2.5 % 97.5 %
## (Intercept) -1.556324e+00 -2.961675e-01
## KIDSDRIV 2.661701e-01 5.061669e-01
## AGE -8.878262e-03 6.878648e-03
## HOMEKIDS -2.303667e-02 1.225125e-01
## YOJ -2.802553e-02 5.649843e-03
## INCOME -5.538160e-06 -1.307895e-06
## PARENT1Yes 1.671831e-01 5.968380e-01
## HOME_VAL -1.976088e-06 -6.341721e-07
## MSTATUSz_No 3.300966e-01 6.576979e-01
## SEXz_F -3.018165e-01 1.373397e-01
## EDUCATIONBachelors -6.069282e-01 -1.535899e-01
## EDUCATIONMasters -6.400225e-01 6.051271e-02
## EDUCATIONPhD -5.855846e-01 2.528727e-01
## EDUCATIONz_High_School -1.684897e-01 2.041287e-01
## JOBClerical 2.503569e-02 7.958751e-01
## JOBDoctor -9.704343e-01 7.659079e-02
## JOBHome_Maker -1.803304e-01 6.433448e-01
## JOBLawyer -2.272048e-01 4.370987e-01
## JOBManager -8.938381e-01 -2.213713e-01
## JOBProfessional -1.883299e-01 5.111049e-01
## JOBStudent -2.049019e-01 6.358108e-01
## JOBz_Blue_Collar -5.368115e-02 6.737077e-01
## TRAVTIME 1.088005e-02 1.826201e-02
## CAR_USEPrivate -9.361895e-01 -5.766566e-01
## BLUEBOOK -3.116528e-05 -1.054019e-05
## TIF -6.985123e-02 -4.106492e-02
## CAR_TYPEPanel_Truck 2.436267e-01 8.777857e-01
## CAR_TYPEPickup 3.563230e-01 7.511686e-01
## CAR_TYPESports_Car 7.701932e-01 1.279312e+00
## CAR_TYPEVan 3.707454e-01 8.665112e-01
## CAR_TYPEz_SUV 5.500061e-01 9.861597e-01
## RED_CARyes -1.787959e-01 1.597226e-01
## OLDCLAIM -2.154397e-05 -6.218763e-06
## CLM_FREQ 1.400122e-01 2.519058e-01
## REVOKEDYes 7.082022e-01 1.066213e+00
## MVR_PTS 8.649250e-02 1.398499e-01
## CAR_AGE -1.563971e-02 1.391991e-02
## URBANICITYz_Highly_Rural/ Rural -2.610914e+00 -2.168632e+00
# Fitted probabilities from Model 1: computed once, stored alongside the
# data in df_train2, and summarised.
predlogit <- predict(logit, type = "response")
df_train2$pred1 <- predlogit
summary(predlogit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002658 0.077043 0.200903 0.263816 0.402781 0.964607
table(true = df_train$TARGET_FLAG, pred = round(fitted(logit)))
## pred
## true 0 1
## 0 5550 458
## 1 1235 918
Ranked by coefficient size, the predictors with the strongest positive relationship with car crashes are: 1. Car Type-Sports 2. License Revoked 3. Car Type-SUV 4. Car Type-Van
This makes sense: a sports car makes it possible to drive faster, which raises the risk of driving recklessly and crashing. A revoked license also implies a past driving incident, which likely means a higher chance of repeating the same mistake in the future.
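The factors that have a negative relationship with car accidents are ranked below: 1. Job-Doctor 2. Car Use-Private 3. Job-Manager 4. Education-Masters (Note that, by coefficient size in the Model 1 summary, Urbanicity-Highly Rural/Rural has the largest negative coefficient, at about -2.39, and the Job-Doctor coefficient is not significant at the 5% level.)
This also makes sense because doctors tend to be more cautious due to the nature of their occupation. Privately used cars are also driven less, so there are fewer opportunities to crash. People in white-collar jobs and with more education also seem to drive more carefully, resulting in fewer crashes.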
# Plots for Model 1.
# Arrange the plot(logit) output in a 2x2 grid.
par(mfrow=c(2,2))
plot(logit)
# Histogram of the Model 1 fitted probabilities. data.frame() names its
# single column `df_train2.pred1`, which is the name aes() refers to below.
data.frame(df_train2$pred1) %>%
ggplot(aes(x = df_train2.pred1)) +
geom_histogram(bins = 50, fill = 'grey50') +
labs(title = 'Histogram of Predictions') +
theme_bw()
options(repos = c(CRAN = "http://cran.rstudio.com/"))
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
?plot.roc
plot.roc(df_train$TARGET_FLAG, df_train2$pred1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Extract the Model 1 terms that are significant at the 5% level.
sigvars <- data.frame(summary(logit)$coef[summary(logit)$coef[, 4] <= .05, 4])
# tibble::rownames_to_column() replaces the deprecated dplyr::add_rownames().
sigvars <- tibble::rownames_to_column(sigvars, "vars")
colist <- dplyr::pull(sigvars, vars)
# colist <- colist[2:11]
# The extracted names are coefficient names (e.g. "PARENT1Yes"), not column
# names, so the columns to keep are listed by hand and overwrite colist.
# NOTE(review): OLDCLAIM is significant in the Model 1 summary but is not in
# this list; confirm whether the omission is intentional.
colist <- c(
  "KIDSDRIV", "INCOME", "PARENT1", "HOME_VAL", "MSTATUS", "EDUCATION", "JOB",
  "TRAVTIME", "CAR_USE", "BLUEBOOK", "TIF", "CAR_TYPE", "CLM_FREQ", "REVOKED",
  "MVR_PTS", "URBANICITY"
)
idx <- match(colist, names(df_train))
# Model 2 training frame: the selected predictors plus the target.
trainmod2 <- cbind(df_train[, idx], df_train2["TARGET_FLAG"])
# MODEL 2
# Logistic regression of TARGET_FLAG on all remaining columns of trainmod2.
logit2 <- glm(
  formula = TARGET_FLAG ~ .,
  family = binomial(link = "logit"),
  data = trainmod2
)
summary(logit2)
##
## Call:
## glm(formula = TARGET_FLAG ~ ., family = binomial(link = "logit"),
## data = trainmod2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6020 -0.7192 -0.3999 0.6347 3.1387
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.056e+00 2.557e-01 -4.131 3.61e-05 ***
## KIDSDRIV 4.222e-01 5.500e-02 7.677 1.63e-14 ***
## INCOME -3.445e-06 1.071e-06 -3.216 0.00130 **
## PARENT1Yes 4.622e-01 9.411e-02 4.911 9.06e-07 ***
## HOME_VAL -1.356e-06 3.406e-07 -3.982 6.83e-05 ***
## MSTATUSz_No 4.692e-01 7.951e-02 5.902 3.60e-09 ***
## EDUCATIONBachelors -3.817e-01 1.088e-01 -3.509 0.00045 ***
## EDUCATIONMasters -3.046e-01 1.613e-01 -1.889 0.05886 .
## EDUCATIONPhD -1.695e-01 1.997e-01 -0.849 0.39604
## EDUCATIONz_High_School 1.828e-02 9.464e-02 0.193 0.84686
## JOBClerical 4.172e-01 1.962e-01 2.126 0.03348 *
## JOBDoctor -4.343e-01 2.661e-01 -1.632 0.10259
## JOBHome_Maker 2.798e-01 2.038e-01 1.373 0.16980
## JOBLawyer 1.132e-01 1.689e-01 0.670 0.50293
## JOBManager -5.601e-01 1.711e-01 -3.272 0.00107 **
## JOBProfessional 1.673e-01 1.780e-01 0.939 0.34748
## JOBStudent 2.823e-01 2.105e-01 1.341 0.17993
## JOBz_Blue_Collar 3.162e-01 1.852e-01 1.707 0.08784 .
## TRAVTIME 1.460e-02 1.878e-03 7.775 7.52e-15 ***
## CAR_USEPrivate -7.586e-01 9.158e-02 -8.284 < 2e-16 ***
## BLUEBOOK -2.329e-05 4.715e-06 -4.941 7.79e-07 ***
## TIF -5.539e-02 7.330e-03 -7.557 4.11e-14 ***
## CAR_TYPEPanel_Truck 6.211e-01 1.507e-01 4.122 3.76e-05 ***
## CAR_TYPEPickup 5.549e-01 1.006e-01 5.517 3.45e-08 ***
## CAR_TYPESports_Car 9.680e-01 1.074e-01 9.015 < 2e-16 ***
## CAR_TYPEVan 6.453e-01 1.219e-01 5.292 1.21e-07 ***
## CAR_TYPEz_SUV 7.202e-01 8.585e-02 8.388 < 2e-16 ***
## CLM_FREQ 1.496e-01 2.549e-02 5.866 4.46e-09 ***
## REVOKEDYes 7.339e-01 8.022e-02 9.148 < 2e-16 ***
## MVR_PTS 1.100e-01 1.351e-02 8.144 3.82e-16 ***
## URBANICITYz_Highly_Rural/ Rural -2.385e+00 1.127e-01 -21.172 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 7314.9 on 8130 degrees of freedom
## AIC: 7376.9
##
## Number of Fisher Scoring iterations: 5
exp(logit2$coefficients)
## (Intercept) KIDSDRIV
## 0.34773874 1.52532752
## INCOME PARENT1Yes
## 0.99999655 1.58754927
## HOME_VAL MSTATUSz_No
## 0.99999864 1.59877425
## EDUCATIONBachelors EDUCATIONMasters
## 0.68270100 0.73738978
## EDUCATIONPhD EDUCATIONz_High_School
## 0.84411734 1.01844512
## JOBClerical JOBDoctor
## 1.51777396 0.64769733
## JOBHome_Maker JOBLawyer
## 1.32285549 1.11980165
## JOBManager JOBProfessional
## 0.57117155 1.18207477
## JOBStudent JOBz_Blue_Collar
## 1.32611593 1.37186955
## TRAVTIME CAR_USEPrivate
## 1.01471068 0.46831392
## BLUEBOOK TIF
## 0.99997671 0.94611218
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 1.86093753 1.74173381
## CAR_TYPESports_Car CAR_TYPEVan
## 2.63258262 1.90649272
## CAR_TYPEz_SUV CLM_FREQ
## 2.05478342 1.16131156
## REVOKEDYes MVR_PTS
## 2.08308560 1.11632537
## URBANICITYz_Highly_Rural/ Rural
## 0.09204753
# Scale factor: mean of the logistic density evaluated at each observation's
# fitted linear predictor (predict(..., type = "link")).
logit2scalar <- mean(dlogis(predict(logit2, type = "link")))
# Model 2 coefficients multiplied by that factor (presumably intended as
# approximate average marginal effects on the probability scale; TODO confirm).
logit2scalar * coef(logit2)
## (Intercept) KIDSDRIV
## -1.540887e-01 6.158990e-02
## INCOME PARENT1Yes
## -5.025836e-07 6.742233e-02
## HOME_VAL MSTATUSz_No
## -1.978561e-07 6.845013e-02
## EDUCATIONBachelors EDUCATIONMasters
## -5.568036e-02 -4.443926e-02
## EDUCATIONPhD EDUCATIONz_High_School
## -2.472058e-02 2.666173e-03
## JOBClerical JOBDoctor
## 6.086571e-02 -6.335829e-02
## JOBHome_Maker JOBLawyer
## 4.081484e-02 1.650602e-02
## JOBManager JOBProfessional
## -8.169976e-02 2.440074e-02
## JOBStudent JOBz_Blue_Collar
## 4.117394e-02 4.612205e-02
## TRAVTIME CAR_USEPrivate
## 2.130294e-03 -1.106634e-01
## BLUEBOOK TIF
## -3.398148e-06 -8.080637e-03
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 9.060030e-02 8.094345e-02
## CAR_TYPESports_Car CAR_TYPEVan
## 1.412023e-01 9.412828e-02
## CAR_TYPEz_SUV CLM_FREQ
## 1.050551e-01 2.181566e-02
## REVOKEDYes MVR_PTS
## 1.070506e-01 1.605247e-02
## URBANICITYz_Highly_Rural/ Rural
## -3.479783e-01
# Fitted probabilities from Model 2: computed once, stored alongside the
# data in df_train2, and summarised.
predlogit2 <- predict(logit2, type = "response")
df_train2$pred2 <- predlogit2
summary(predlogit2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002346 0.077433 0.201978 0.263816 0.399281 0.966132
table(true = df_train$TARGET_FLAG, pred = round(fitted(logit2)))
## pred
## true 0 1
## 0 5553 455
## 1 1243 910
# Plots for Model 2.
# Arrange the plot(logit2) output in a 2x2 grid.
par(mfrow=c(2,2))
plot(logit2)
# Histogram of the Model 2 fitted probabilities. data.frame() names its
# single column `df_train2.pred2`, which is the name aes() refers to below.
data.frame(df_train2$pred2) %>%
ggplot(aes(x = df_train2.pred2)) +
geom_histogram(bins = 50, fill = 'grey50') +
labs(title = 'Histogram of Predictions') +
theme_bw()
# pROC is already attached earlier in this script; install it only when it
# is genuinely missing instead of re-installing it on every run.
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
##
## The downloaded binary packages are in
## /var/folders/_2/ds2qp0zn11v05n6h2k954t800000gn/T//Rtmpx9eFeQ/downloaded_packages
library("pROC")
plot.roc(df_train$TARGET_FLAG, df_train2$pred2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# MODEL 3
# PC Model no racial bias: a reduced logistic regression that uses only
# KIDSDRIV, INCOME, HOME_VAL and TRAVTIME as predictors.
logit3 <- glm(
  formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
  family = binomial(link = "logit"),
  data = df_train
)
summary(logit3)
##
## Call:
## glm(formula = TARGET_FLAG ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
## family = binomial(link = "logit"), data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6189 -0.8209 -0.6767 1.2589 2.8068
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.785e-01 7.291e-02 -9.306 < 2e-16 ***
## KIDSDRIV 3.998e-01 4.591e-02 8.707 < 2e-16 ***
## INCOME -3.502e-06 6.824e-07 -5.132 2.87e-07 ***
## HOME_VAL -2.973e-06 2.498e-07 -11.903 < 2e-16 ***
## TRAVTIME 5.842e-03 1.598e-03 3.656 0.000257 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9418.0 on 8160 degrees of freedom
## Residual deviance: 9024.8 on 8156 degrees of freedom
## AIC: 9034.8
##
## Number of Fisher Scoring iterations: 4
# Exponentiated Model 3 coefficients (logit link, so these are odds ratios)
exp(coef(logit3))
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## 0.5073733 1.4914800 0.9999965 0.9999970 1.0058590
# Fitted probabilities of Model 3: kept as a standalone vector, copied into
# df_train2 as column pred3, and summarised.
predlogit3 <- predict(logit3, type = "response")
df_train2$pred3 <- predlogit3
summary(predlogit3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01182 0.19779 0.25596 0.26382 0.32923 0.75815
# Cross-tabulate the observed flag against Model 3's fitted probabilities
# rounded to 0/1.
table(
  true = df_train$TARGET_FLAG,
  pred = round(fitted(logit3), digits = 0)
)
## pred
## true 0 1
## 0 5940 68
## 1 2093 60
# Diagnostic plots for Model 3, drawn on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(logit3)
# Histogram of the Model 3 fitted probabilities stored in df_train2$pred3
ggplot(data.frame(df_train2$pred3), aes(x = df_train2.pred3)) +
  geom_histogram(bins = 50, fill = "grey50") +
  labs(title = "Histogram of Predictions") +
  theme_bw()
# ROC curve for Model 3: observed flag against the fitted probabilities
plot.roc(
  df_train$TARGET_FLAG,
  df_train2$pred3
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Model 3: mean logistic density over the fitted linear predictors, then every
# coefficient scaled by that single factor.
logit3scalar <- predict(logit3, type = "link") %>%
  dlogis() %>%
  mean()
coef(logit3) * logit3scalar
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## -1.255691e-01 7.398379e-02 -6.480644e-07 -5.501764e-07 1.081145e-03
# Scaled coefficients of the full logit model, rounded to two decimals
round(coef(logit) * logitscalar, digits = 2)
## (Intercept) KIDSDRIV
## -0.13 0.06
## AGE HOMEKIDS
## 0.00 0.01
## YOJ INCOME
## 0.00 0.00
## PARENT1Yes HOME_VAL
## 0.06 0.00
## MSTATUSz_No SEXz_F
## 0.07 -0.01
## EDUCATIONBachelors EDUCATIONMasters
## -0.06 -0.04
## EDUCATIONPhD EDUCATIONz_High_School
## -0.02 0.00
## JOBClerical JOBDoctor
## 0.06 -0.07
## JOBHome_Maker JOBLawyer
## 0.03 0.02
## JOBManager JOBProfessional
## -0.08 0.02
## JOBStudent JOBz_Blue_Collar
## 0.03 0.05
## TRAVTIME CAR_USEPrivate
## 0.00 -0.11
## BLUEBOOK TIF
## 0.00 -0.01
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 0.08 0.08
## CAR_TYPESports_Car CAR_TYPEVan
## 0.15 0.09
## CAR_TYPEz_SUV RED_CARyes
## 0.11 0.00
## OLDCLAIM CLM_FREQ
## 0.00 0.03
## REVOKEDYes MVR_PTS
## 0.13 0.02
## CAR_AGE URBANICITYz_Highly_Rural/ Rural
## 0.00 -0.35
# Scaled coefficients of Model 2, rounded to two decimals
round(coef(logit2) * logit2scalar, digits = 2)
## (Intercept) KIDSDRIV
## -0.15 0.06
## INCOME PARENT1Yes
## 0.00 0.07
## HOME_VAL MSTATUSz_No
## 0.00 0.07
## EDUCATIONBachelors EDUCATIONMasters
## -0.06 -0.04
## EDUCATIONPhD EDUCATIONz_High_School
## -0.02 0.00
## JOBClerical JOBDoctor
## 0.06 -0.06
## JOBHome_Maker JOBLawyer
## 0.04 0.02
## JOBManager JOBProfessional
## -0.08 0.02
## JOBStudent JOBz_Blue_Collar
## 0.04 0.05
## TRAVTIME CAR_USEPrivate
## 0.00 -0.11
## BLUEBOOK TIF
## 0.00 -0.01
## CAR_TYPEPanel_Truck CAR_TYPEPickup
## 0.09 0.08
## CAR_TYPESports_Car CAR_TYPEVan
## 0.14 0.09
## CAR_TYPEz_SUV CLM_FREQ
## 0.11 0.02
## REVOKEDYes MVR_PTS
## 0.11 0.02
## URBANICITYz_Highly_Rural/ Rural
## -0.35
# Scaled coefficients of Model 3, rounded to two decimals
round(coef(logit3) * logit3scalar, digits = 2)
## (Intercept) KIDSDRIV INCOME HOME_VAL TRAVTIME
## -0.13 0.07 0.00 0.00 0.00
# MODEL 1
# Linear regression of TARGET_AMT on every other column of df_train.
# NOTE(review): the `.` shorthand also brings TARGET_FLAG in as a predictor
# (it shows up as the TARGET_FLAG1 coefficient) -- confirm this is intended.
model <- lm(formula = TARGET_AMT ~ ., data = df_train)
summary(model)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6287 -469 -57 237 101119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.912e+02 4.878e+02 -1.212 0.225619
## TARGET_FLAG1 5.705e+03 1.136e+02 50.241 < 2e-16 ***
## KIDSDRIV -3.105e+01 9.909e+01 -0.313 0.754024
## AGE 5.967e+00 6.171e+00 0.967 0.333637
## HOMEKIDS 3.984e+01 5.709e+01 0.698 0.485235
## YOJ 7.892e+00 1.319e+01 0.598 0.549525
## INCOME -2.234e-03 1.577e-03 -1.417 0.156459
## PARENT1Yes 1.407e+02 1.767e+02 0.797 0.425723
## HOME_VAL 3.955e-04 5.163e-04 0.766 0.443683
## MSTATUSz_No 1.692e+02 1.267e+02 1.335 0.181839
## SEXz_F -2.873e+02 1.605e+02 -1.789 0.073572 .
## EDUCATIONBachelors 6.843e+01 1.790e+02 0.382 0.702279
## EDUCATIONMasters 2.237e+02 2.620e+02 0.854 0.393284
## EDUCATIONPhD 4.296e+02 3.110e+02 1.382 0.167133
## EDUCATIONz_High_School -1.229e+02 1.502e+02 -0.818 0.413279
## JOBClerical -4.752e+00 2.984e+02 -0.016 0.987294
## JOBDoctor -2.788e+02 3.571e+02 -0.781 0.434921
## JOBHome_Maker -6.546e+01 3.185e+02 -0.206 0.837182
## JOBLawyer 7.787e+01 2.583e+02 0.302 0.763035
## JOBManager -1.212e+02 2.521e+02 -0.481 0.630525
## JOBProfessional 1.764e+02 2.698e+02 0.654 0.513241
## JOBStudent -1.269e+02 3.267e+02 -0.388 0.697709
## JOBz_Blue_Collar 5.766e+01 2.813e+02 0.205 0.837595
## TRAVTIME 5.649e-01 2.824e+00 0.200 0.841445
## CAR_USEPrivate -9.639e+01 1.443e+02 -0.668 0.504033
## BLUEBOOK 2.930e-02 7.536e-03 3.889 0.000102 ***
## TIF -2.939e+00 1.068e+01 -0.275 0.783072
## CAR_TYPEPanel_Truck -5.226e+01 2.431e+02 -0.215 0.829783
## CAR_TYPEPickup -3.071e+01 1.493e+02 -0.206 0.837045
## CAR_TYPESports_Car 2.049e+02 1.909e+02 1.073 0.283203
## CAR_TYPEVan 9.584e+01 1.864e+02 0.514 0.607236
## CAR_TYPEz_SUV 1.602e+02 1.571e+02 1.020 0.307680
## RED_CARyes -2.857e+01 1.302e+02 -0.219 0.826320
## OLDCLAIM 3.060e-03 6.501e-03 0.471 0.637817
## CLM_FREQ -4.347e+01 4.821e+01 -0.902 0.367271
## REVOKEDYes -3.277e+02 1.526e+02 -2.148 0.031763 *
## MVR_PTS 5.398e+01 2.277e+01 2.371 0.017771 *
## CAR_AGE -2.502e+01 1.118e+01 -2.238 0.025219 *
## URBANICITYz_Highly_Rural/ Rural 3.364e+01 1.263e+02 0.266 0.790056
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3969 on 8122 degrees of freedom
## Multiple R-squared: 0.2913, Adjusted R-squared: 0.288
## F-statistic: 87.85 on 38 and 8122 DF, p-value: < 2.2e-16
# Model 1: residuals vs. fitted and fitted vs. observed, side by side
par(mfrow = c(1, 2))
plot(model$residuals ~ model$fitted.values)
plot(model$fitted.values, df_train$TARGET_AMT)
# Model 1: plot() diagnostics for the lm fit on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(model)
# Extract the Model 1 coefficients whose p-value (4th column of the coefficient
# table) is at most 0.05, with the coefficient names in a `vars` column.
# coef() replaces `$coef`, which only resolved through partial matching of
# `$coefficients`; tibble::rownames_to_column() replaces the deprecated
# dplyr::add_rownames().
model_coefs <- coef(summary(model))
sig_rows <- model_coefs[, 4] <= 0.05
sigvars <- data.frame(p_value = model_coefs[sig_rows, 4])
sigvars <- tibble::rownames_to_column(sigvars, "vars")
## Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::rownames_to_column()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Columns carried into the reduced model. They are written out by hand because
# the significant coefficient names (e.g. TARGET_FLAG1, REVOKEDYes) are
# factor-level labels rather than column names of df_train. The earlier
# `colist <- dplyr::pull(sigvars, vars)` was overwritten on the very next line
# and has been removed as dead code.
colist <- c("TARGET_FLAG", "BLUEBOOK", "REVOKED", "MVR_PTS", "CAR_AGE")
idx <- match(colist, names(df_train))
# Selected predictors plus the response, in that order
trainmod2 <- cbind(df_train[, idx], df_train["TARGET_AMT"])
# MODEL 2
# Linear regression of TARGET_AMT on the hand-picked columns held in trainmod2.
model2 <- lm(formula = TARGET_AMT ~ ., data = trainmod2)
summary(model2)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = trainmod2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6350 -383 -30 193 101423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.233e+02 1.183e+02 -3.578 0.000349 ***
## TARGET_FLAG1 5.724e+03 1.040e+02 55.036 < 2e-16 ***
## BLUEBOOK 3.010e-02 5.328e-03 5.650 1.65e-08 ***
## REVOKEDYes -2.878e+02 1.355e+02 -2.123 0.033774 *
## MVR_PTS 5.081e+01 2.097e+01 2.423 0.015430 *
## CAR_AGE -1.277e+01 8.121e+00 -1.572 0.115961
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3967 on 8155 degrees of freedom
## Multiple R-squared: 0.2891, Adjusted R-squared: 0.2887
## F-statistic: 663.3 on 5 and 8155 DF, p-value: < 2.2e-16
# Model 2: residuals vs. fitted and fitted vs. observed, side by side.
# Uses the same 1 x 2 layout as the equivalent plot pairs for Models 1 and 3
# (a 2 x 2 grid left half of the page empty for these two plots).
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values)
plot(model2$fitted.values, df_train$TARGET_AMT)
# Model 2: plot() diagnostics for the lm fit on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(model2)
# Residual plots of the reduced and the full model next to each other, each
# with a horizontal reference line at zero
par(mfrow = c(1, 2))
plot(model2$residuals ~ model2$fitted.values, main = "New Reduced Var Model")
abline(h = 0)
plot(model$residuals ~ model$fitted.values, main = "Original Model All Vars")
abline(h = 0)
# MODEL 3
# Linear regression of TARGET_AMT on four predictors only; per the author's
# note, variables with opposite coefficients were removed.
model3 <- lm(
  formula = TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
  data = df_train
)
summary(model3)
##
## Call:
## lm(formula = TARGET_AMT ~ KIDSDRIV + INCOME + HOME_VAL + TRAVTIME,
## data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3816 -1652 -1248 -324 106267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.693e+03 1.468e+02 11.527 < 2e-16 ***
## KIDSDRIV 4.889e+02 1.014e+02 4.822 1.45e-06 ***
## INCOME -1.259e-03 1.336e-03 -0.943 0.3459
## HOME_VAL -2.812e-03 4.921e-04 -5.714 1.14e-08 ***
## TRAVTIME 7.208e+00 3.261e+00 2.211 0.0271 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4680 on 8156 degrees of freedom
## Multiple R-squared: 0.01059, Adjusted R-squared: 0.01011
## F-statistic: 21.83 on 4 and 8156 DF, p-value: < 2.2e-16
# Model 3: residuals vs. fitted and fitted vs. observed, side by side
par(mfrow = c(1, 2))
plot(model3$residuals ~ model3$fitted.values)
plot(model3$fitted.values, df_train$TARGET_AMT)
# Model 3: plot() diagnostics for the lm fit on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(model3)
# Load the evaluation data set, keep an untouched copy in test2, and report
# its dimensions.
# NOTE(review): the path is absolute and machine-specific.
test <- read.csv(
  file = "/Users/bellajean/Downloads/insurance-evaluation-data.csv"
)
test2 <- test
dim(test)
## [1] 2141 26
# Set both target columns of the evaluation data to 0 as placeholders.
test$TARGET_AMT <- 0
test$TARGET_FLAG <- 0
# Clean the evaluation data: run the currencyconv helper over the money
# columns, the underscore helper over the categorical columns, convert those
# categorical columns to factors, and make TARGET_FLAG a factor.
# tibble::as_tibble() replaces the deprecated as.tbl(), and across() replaces
# the superseded mutate_at().
test <- tibble::as_tibble(test) %>%
  mutate(
    across(all_of(c("INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM")), currencyconv)
  ) %>%
  mutate(
    across(all_of(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY")), underscore)
  ) %>%
  mutate(
    across(all_of(c("EDUCATION", "JOB", "CAR_TYPE", "URBANICITY")), as.factor)
  ) %>%
  mutate(TARGET_FLAG = as.factor(TARGET_FLAG))
## Warning: `as.tbl()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::as_tibble()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Replace each of these count/amount columns with log(x + 1).
# NOTE(review): presumably mirrors a transform applied to the training data
# earlier in the file -- confirm.
test[c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ")] <- lapply(
  test[c("HOMEKIDS", "MVR_PTS", "OLDCLAIM", "TIF", "KIDSDRIV", "CLM_FREQ")],
  function(x) log(x + 1)
)
# Fill missing values in each of these columns with that column's own mean,
# computed on the evaluation data.
test[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")] <- lapply(
  test[c("AGE", "YOJ", "HOME_VAL", "CAR_AGE", "INCOME")],
  function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
)
# Drop the INDEX identifier column
test <- test[, colnames(test) != "INDEX"]
# Predicted probabilities from the full logit model on the evaluation data
TARGET_FLAG <- predict(
  logit,
  newdata = test,
  type = "response"
)
# Classify as 1 when the probability exceeds 0.5, otherwise 0
y_pred_num <- ifelse(TARGET_FLAG > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels = c(0, 1))
# Count of predicted 0s and 1s
summary(y_pred)
## 0 1
## 1812 329
# Training (first row) vs. evaluation (second row) predicted-probability
# summaries, rounded to four decimals and rendered as a table
kable(
  rbind(
    round(summary(predlogit), 4),
    round(summary(TARGET_FLAG), 4)
  )
)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 0.0027 | 0.0770 | 0.2009 | 0.2638 | 0.4028 | 0.9646 |
| 0.0042 | 0.0865 | 0.2213 | 0.2680 | 0.4035 | 0.9216 |
# Feed the predicted claim flag (y_pred, a 0/1 factor) into the amount model.
# Before this fix test$TARGET_FLAG still held the all-zero placeholder, so the
# TARGET_FLAG1 term of `model` never contributed to any prediction.
test$TARGET_FLAG <- unname(y_pred)
# Copy of the evaluation data without the flag column
test2 <- test[, !(colnames(test) %in% c("TARGET_FLAG"))]
# Predicted TARGET_AMT from the full linear model, with confidence bounds
# (columns fit / lwr / upr)
TARGET_AMT <- predict(model, newdata = test, interval = "confidence")
summary(TARGET_AMT)
## fit lwr upr
## Min. :-1310.19 Min. :-1935.9 Min. :-684.4
## 1st Qu.: -297.30 1st Qu.: -800.7 1st Qu.: 193.8
## Median : -77.68 Median : -559.5 Median : 402.8
## Mean : -59.14 Mean : -569.1 Mean : 450.8
## 3rd Qu.: 171.05 3rd Qu.: -329.4 3rd Qu.: 682.8
## Max. : 1172.44 Max. : 452.2 Max. :1899.9
# Show the full linear model (Model 1) summary once more for reference
summary(object = model)
##
## Call:
## lm(formula = TARGET_AMT ~ ., data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6287 -469 -57 237 101119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.912e+02 4.878e+02 -1.212 0.225619
## TARGET_FLAG1 5.705e+03 1.136e+02 50.241 < 2e-16 ***
## KIDSDRIV -3.105e+01 9.909e+01 -0.313 0.754024
## AGE 5.967e+00 6.171e+00 0.967 0.333637
## HOMEKIDS 3.984e+01 5.709e+01 0.698 0.485235
## YOJ 7.892e+00 1.319e+01 0.598 0.549525
## INCOME -2.234e-03 1.577e-03 -1.417 0.156459
## PARENT1Yes 1.407e+02 1.767e+02 0.797 0.425723
## HOME_VAL 3.955e-04 5.163e-04 0.766 0.443683
## MSTATUSz_No 1.692e+02 1.267e+02 1.335 0.181839
## SEXz_F -2.873e+02 1.605e+02 -1.789 0.073572 .
## EDUCATIONBachelors 6.843e+01 1.790e+02 0.382 0.702279
## EDUCATIONMasters 2.237e+02 2.620e+02 0.854 0.393284
## EDUCATIONPhD 4.296e+02 3.110e+02 1.382 0.167133
## EDUCATIONz_High_School -1.229e+02 1.502e+02 -0.818 0.413279
## JOBClerical -4.752e+00 2.984e+02 -0.016 0.987294
## JOBDoctor -2.788e+02 3.571e+02 -0.781 0.434921
## JOBHome_Maker -6.546e+01 3.185e+02 -0.206 0.837182
## JOBLawyer 7.787e+01 2.583e+02 0.302 0.763035
## JOBManager -1.212e+02 2.521e+02 -0.481 0.630525
## JOBProfessional 1.764e+02 2.698e+02 0.654 0.513241
## JOBStudent -1.269e+02 3.267e+02 -0.388 0.697709
## JOBz_Blue_Collar 5.766e+01 2.813e+02 0.205 0.837595
## TRAVTIME 5.649e-01 2.824e+00 0.200 0.841445
## CAR_USEPrivate -9.639e+01 1.443e+02 -0.668 0.504033
## BLUEBOOK 2.930e-02 7.536e-03 3.889 0.000102 ***
## TIF -2.939e+00 1.068e+01 -0.275 0.783072
## CAR_TYPEPanel_Truck -5.226e+01 2.431e+02 -0.215 0.829783
## CAR_TYPEPickup -3.071e+01 1.493e+02 -0.206 0.837045
## CAR_TYPESports_Car 2.049e+02 1.909e+02 1.073 0.283203
## CAR_TYPEVan 9.584e+01 1.864e+02 0.514 0.607236
## CAR_TYPEz_SUV 1.602e+02 1.571e+02 1.020 0.307680
## RED_CARyes -2.857e+01 1.302e+02 -0.219 0.826320
## OLDCLAIM 3.060e-03 6.501e-03 0.471 0.637817
## CLM_FREQ -4.347e+01 4.821e+01 -0.902 0.367271
## REVOKEDYes -3.277e+02 1.526e+02 -2.148 0.031763 *
## MVR_PTS 5.398e+01 2.277e+01 2.371 0.017771 *
## CAR_AGE -2.502e+01 1.118e+01 -2.238 0.025219 *
## URBANICITYz_Highly_Rural/ Rural 3.364e+01 1.263e+02 0.266 0.790056
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3969 on 8122 degrees of freedom
## Multiple R-squared: 0.2913, Adjusted R-squared: 0.288
## F-statistic: 87.85 on 38 and 8122 DF, p-value: < 2.2e-16
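Conclusion: Model 1 wins. The full-variable models (`logit` for TARGET_FLAG and `model` for TARGET_AMT) are the ones used above to score the evaluation data.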