library(tidyverse)
library(janitor)
library(readr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(gridExtra)
library(summarytools)
library(randomForest)
library(reshape2)
library(tidymodels)
df <- read_csv("datasets/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
df1 <- janitor::clean_names(df) %>%
rename(days_associated = vintage,
health_annual_paid = annual_premium) %>%
mutate(
across(where(is.character), tolower),
driving_license = ifelse(driving_license == 1, "yes", "no"),
previously_insured = ifelse(previously_insured == 1, "yes", "no"),
response = ifelse(response == 1, "yes", "no"),
vehicle_age = case_when(
vehicle_age == "< 1 year" ~ "below_1_year",
vehicle_age == "1-2 year" ~ "between_1_2_years",
vehicle_age == "> 2 years" ~ "over_2_years"
)
) %>%
mutate_if(is.character, as.factor) %>%
mutate(response = factor(response, levels = c("yes", "no")),
driving_license = factor(driving_license, levels = c("yes", "no")),
previously_insured = factor(previously_insured, levels = c("yes", "no")),
vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
)
glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender <fct> male, male, male, male, female, female, male, fem…
## $ age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")
variable_classes <- tibble(variables = names(df1),
type = unlist(lapply(df1, class)))
variable_classes
variables <- df1 %>% names()
description <- c(
"Unique ID for the customer",
"Gender of the customer",
"Age of the customer",
"Customer has DL (yes/no)",
"Unique code for the region of the customer",
"Customer already has Vehicle Insurance (yes/no)",
"Age of the Vehicle",
"Customer got his/her vehicle damaged in the past (yes/no)",
"The amount customer needs to pay as premium in the year",
"Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
"Number of Days, Customer has been associated with the company",
"Customer is interested in car insurance (yes/no)"
)
df_description <- tibble(variables = variables,
description = description)
kable(df_description, format = "html") %>%
kableExtra::kable_styling(bootstrap_options = "striped",
full_width = FALSE)
variables | description |
---|---|
id | Unique ID for the customer |
gender | Gender of the customer |
age | Age of the customer |
driving_license | Customer has DL (yes/no) |
region_code | Unique code for the region of the customer |
previously_insured | Customer already has Vehicle Insurance (yes/no) |
vehicle_age | Age of the Vehicle |
vehicle_damage | Customer got his/her vehicle damaged in the past (yes/no) |
health_annual_paid | The amount customer needs to pay as premium in the year |
policy_sales_channel | Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc. |
days_associated | Number of Days, Customer has been associated with the company |
response | Customer is interested in car insurance (yes/no) |
# Read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender <fct> male, male, male, male, female, female, male, fem…
## $ age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
skimr::skim(df_cleaned)
Name | df_cleaned |
Number of rows | 381109 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
factor | 6 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
gender | 0 | 1 | FALSE | 2 | mal: 206089, fem: 175020 |
driving_license | 0 | 1 | FALSE | 2 | yes: 380297, no: 812 |
previously_insured | 0 | 1 | FALSE | 2 | no: 206481, yes: 174628 |
vehicle_age | 0 | 1 | FALSE | 3 | bet: 200316, bel: 164786, ove: 16007 |
vehicle_damage | 0 | 1 | FALSE | 2 | yes: 192413, no: 188696 |
response | 0 | 1 | FALSE | 2 | no: 334399, yes: 46710 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 190555.00 | 110016.84 | 1 | 95278 | 190555 | 285832 | 381109 | ▇▇▇▇▇ |
age | 0 | 1 | 38.82 | 15.51 | 20 | 25 | 36 | 49 | 85 | ▇▃▃▂▁ |
region_code | 0 | 1 | 26.39 | 13.23 | 0 | 15 | 28 | 35 | 52 | ▃▂▇▃▃ |
health_annual_paid | 0 | 1 | 30564.39 | 17213.16 | 2630 | 24405 | 31669 | 39400 | 540165 | ▇▁▁▁▁ |
policy_sales_channel | 0 | 1 | 112.03 | 54.20 | 1 | 29 | 133 | 152 | 163 | ▅▁▁▃▇ |
days_associated | 0 | 1 | 154.35 | 83.67 | 10 | 82 | 154 | 227 | 299 | ▇▇▇▇▇ |
df_cleaned %>%
select(-id) %>%
tbl_summary(
type = list(response ~ "categorical",
driving_license ~ "categorical",
previously_insured ~ "categorical",
vehicle_damage ~ "categorical"),
digits = list(all_categorical() ~ c(0, 2))
)
Characteristic | N = 381,1091 |
---|---|
gender | |
female | 175,020 (45.92%) |
male | 206,089 (54.08%) |
age | 36 (25, 49) |
driving_license | |
yes | 380,297 (99.79%) |
no | 812 (0.21%) |
region_code | 28 (15, 35) |
previously_insured | |
yes | 174,628 (45.82%) |
no | 206,481 (54.18%) |
vehicle_age | |
below_1_year | 164,786 (43.24%) |
between_1_2_years | 200,316 (52.56%) |
over_2_years | 16,007 (4.20%) |
vehicle_damage | |
yes | 192,413 (50.49%) |
no | 188,696 (49.51%) |
health_annual_paid | 31,669 (24,405, 39,400) |
policy_sales_channel | 133 (29, 152) |
days_associated | 154 (82, 227) |
response | |
yes | 46,710 (12.26%) |
no | 334,399 (87.74%) |
1 n (%); Median (IQR) |
num_attributes <- df_cleaned %>%
select(age, health_annual_paid, days_associated)
descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)
## Error : Can't find summarytools
kable(data.frame(descriptive_tab), format = "html") %>%
kableExtra::kable_styling(bootstrap_options = "striped",
full_width = FALSE)
age | days_associated | health_annual_paid | |
---|---|---|---|
Mean | 38.82 | 154.35 | 30564.39 |
Std.Dev | 15.51 | 83.67 | 17213.16 |
Min | 20.00 | 10.00 | 2630.00 |
Q1 | 25.00 | 82.00 | 24405.00 |
Median | 36.00 | 154.00 | 31669.00 |
Q3 | 49.00 | 227.00 | 39400.00 |
Max | 85.00 | 299.00 | 540165.00 |
MAD | 17.79 | 108.23 | 11125.43 |
IQR | 24.00 | 145.00 | 14995.00 |
CV | 0.40 | 0.54 | 0.56 |
Skewness | 0.67 | 0.00 | 1.77 |
SE.Skewness | 0.00 | 0.00 | 0.00 |
Kurtosis | -0.57 | -1.20 | 34.00 |
N.Valid | 381109.00 | 381109.00 | 381109.00 |
Pct.Valid | 100.00 | 100.00 | 100.00 |
# Age ---------------------
age_plt <- num_attributes %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 1,
color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") +
theme_minimal()
# health_annual_paid ---------------------
paid_plt <- num_attributes %>%
ggplot(aes(x = health_annual_paid)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = 10000,
color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") +
theme_minimal()
# days_associated ---------------------
days_plt <- num_attributes %>%
ggplot(aes(x = days_associated)) +
geom_histogram(aes(y = after_stat(density)),
color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated \nDistribution") +
theme_minimal()
gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)
num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>%
select(-id, -one_of(num_names))
gender_plt <- cat_attributes %>%
ggplot(aes(x = gender)) +
geom_bar(aes(fill = gender)) +
labs(x = "Gender", y = "#",
title = "Customers Gender") +
theme_minimal()
driving_license_plt <- cat_attributes %>%
ggplot(aes(x = driving_license)) +
geom_bar(aes(fill = driving_license),
show.legend = FALSE) +
labs(x = "Driving License", y = "#",
title = "Customers \nDriving License") +
theme_minimal()
region_code_plt <- cat_attributes %>%
ggplot(aes(x = region_code)) +
geom_bar(aes(fill = factor(region_code)),
show.legend = FALSE) +
labs(x = "Region Code", y = "#",
title = "Customers \nRegion Code") +
theme_minimal()
previously_insured_plt <- cat_attributes %>%
ggplot(aes(x = previously_insured)) +
geom_bar(aes(fill = previously_insured),
show.legend = FALSE) +
labs(x = "Previously Insured", y = "#",
title = "Customers \nPreviously Insured") +
theme_minimal()
vehicle_age_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_age)) +
geom_bar(aes(fill = vehicle_age),
show.legend = FALSE) +
labs(x = "vehicle_age", y = "#",
title = "Customers \nVehicle Age") +
theme_minimal()
vehicle_damage_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_damage)) +
geom_bar(aes(fill = vehicle_damage),
show.legend = FALSE) +
labs(x = "vehicle_damage", y = "#",
title = "Customers \nVehicle Damage") +
theme_minimal()
policy_sales_channel_plt <- cat_attributes %>%
ggplot(aes(x = policy_sales_channel)) +
geom_bar(aes(fill = factor(policy_sales_channel)),
show.legend = FALSE) +
labs(x = "policy_sales_channel", y = "#",
title = "Customers \nPolicy Sales Channel") +
theme_minimal()
response_plt <- cat_attributes %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = response),
show.legend = FALSE) +
labs(x = "response", y = "#",
title = "Customers response") +
theme_minimal()
gridExtra::grid.arrange(gender_plt, driving_license_plt,
region_code_plt, previously_insured_plt,
vehicle_age_plt, vehicle_damage_plt,
policy_sales_channel_plt, response_plt,
ncol = 2, nrow = 4)
# Read data_cleaned
df_cleaned <- readRDS("df_cleaned.rds")
# Boxplot
age_boxplot <- df_cleaned %>%
ggplot(aes(x = response, y = age)) +
stat_boxplot(geom = 'errorbar', width = 0.6) +
geom_boxplot(aes(fill = response), show.legend = FALSE) +
labs(title = "Age vs. Response Comparison", y = "Age", x = "Response") +
theme_bw()
ggsave("img/age_boxplot.jpg", plot = age_boxplot, width = 6,
height = 4)
# Histogram
age_plot <- df_cleaned %>%
ggplot(aes(x = age)) +
geom_histogram(binwidth = 1, color = "gray", fill="navy") +
facet_wrap(vars(response), nrow = 2, scales = "free_y") +
labs(y = "Number of clients") +
ggtitle("Age distribution")
age_plot
ggsave("img/age_plot.jpg", plot = age_plot, width = 6,
height = 4)
# Descriptive Statistics
df_cleaned %>%
select(age, response) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
age | 43 (35, 51) | 34 (24, 49) | <0.001 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
Young people seems to be less interested in the car insurance. The median age for intereseted customers is 43 years (IQR: 35, 51), while the median for non-interested customers is 34 years (IQR: 24, 49).
gender_plot <- df_cleaned %>%
select(response, gender) %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = gender), position = 'dodge') +
labs(title = "Gender vs. Response",
x = "Response", y = "Number of customers") +
theme_bw()
ggsave("img/gender_plot.png", plot = gender_plot)
df_cleaned %>%
select(response, gender) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
gender | <0.001 | ||
female | 18,185 (39%) | 156,835 (47%) | |
male | 28,525 (61%) | 177,564 (53%) | |
1 n (%) | |||
2 Pearson’s Chi-squared test |
For customers interested in the car insurance, 61% were men, and 39% were women. So, this hypothesis is FALSE. Although, gender and response are statistically significant, i. e., are related.
car_age_plot <- df_cleaned %>%
select(response, vehicle_age) %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = vehicle_age), position = 'dodge') +
labs(title = "Vehicle Age vs. Response",
x = "Response", y = "Number of customers") +
theme_bw()
ggsave("img/car_age_plot.jpg", plot = car_age_plot)
df_cleaned %>%
select(response, vehicle_age) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
vehicle_age | <0.001 | ||
below_1_year | 7,202 (15%) | 157,584 (47%) | |
between_1_2_years | 34,806 (75%) | 165,510 (49%) | |
over_2_years | 4,702 (10%) | 11,305 (3.4%) | |
1 n (%) | |||
2 Pearson’s Chi-squared test |
Customers with cars aged between 1 and 2 years are more likely to be interested in the car insurance (75%). While, only 15% of the interested customers have newer cars.
car_damage_plot <- df_cleaned %>%
select(response, vehicle_damage) %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = vehicle_damage), position = 'dodge') +
labs(title = "Vehicle Damage vs. Response",
x = "Response", y = "Number of customers") +
theme_bw()
car_damage_plot
ggsave("img/car_damage_plot.jpg", plot = car_damage_plot)
df_cleaned %>%
select(response, vehicle_damage) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
vehicle_damage | 45,728 (98%) | 146,685 (44%) | <0.001 |
1 n (%) | |||
2 Pearson’s Chi-squared test |
Customers with previous car damage are likely to be interested in the car insurance, as 98% said yes.
car_insurance_plot <- df_cleaned %>%
select(response, previously_insured) %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = previously_insured), position = 'dodge') +
labs(title = "Vehicle Previously Insured vs. Response",
x = "Response", y = "Number of customers") +
theme_bw()
car_insurance_plot
ggsave("img/car_insurance_plot.jpg", plot = car_insurance_plot)
df_cleaned %>%
select(response, previously_insured) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
previously_insured | 158 (0.3%) | 174,470 (52%) | <0.001 |
1 n (%) | |||
2 Pearson’s Chi-squared test |
Only 0.3% of customers interested in the car insurance have the car previously insured.
# Boxplot
expenses_boxplot <- df_cleaned %>%
ggplot(aes(x = response, y = health_annual_paid)) +
stat_boxplot(geom = 'errorbar', width = 0.6) +
geom_boxplot(aes(fill = response), show.legend = FALSE) +
labs(title = "Annual Payment vs. Response Comparison",
y = "Annual Payment", x = "Response") +
theme_bw()
ggsave("img/expenses_boxplot.jpg",
plot = expenses_boxplot, width = 6,
height = 4)
# Histogram
expenses_plot <- df_cleaned %>%
ggplot(aes(x = health_annual_paid)) +
geom_histogram(binwidth = 10000, color = "gray", fill="navy") +
facet_wrap(vars(response), nrow = 2, scales = "free_y") +
labs(y = "Number of clients") +
ggtitle("Expenses distribution")
expenses_plot
ggsave("img/expenses_plot.jpg", plot = expenses_plot,
width = 6,
height = 4)
# Descriptive Statistics
df_cleaned %>%
select(response, health_annual_paid) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
health_annual_paid | 33,002 (24,868, 41,297) | 31,504 (24,351, 39,120) | <0.001 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
Although the health annual paid showed to be significant, we consider this hypotheses FALSE, and will further investigate the outliers.
# Boxplot
days_boxplot <- df_cleaned %>%
ggplot(aes(x = response, y = days_associated)) +
stat_boxplot(geom = 'errorbar', width = 0.6) +
geom_boxplot(aes(fill = response), show.legend = FALSE) +
labs(title = "Days associated vs. Response Comparison",
y = "Days", x = "Response") +
theme_bw()
ggsave("img/days_boxplot.jpg",
plot = days_boxplot, width = 6,
height = 4)
# Histogram
days_plot <- df_cleaned %>%
ggplot(aes(x = days_associated)) +
geom_histogram(binwidth = 30, color = "gray", fill="navy") +
facet_wrap(vars(response), nrow = 2, scales = "free_y") +
labs(y = "Number of clients") +
ggtitle("Days associated distribution")
days_plot
ggsave("img/days_plot.jpg", plot = days_plot,
width = 6,
height = 4)
df_cleaned %>%
select(response, days_associated) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
days_associated | 154 (82, 226) | 154 (82, 227) | 0.5 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
This hypothesis is FALSE, basically the interested customers and non-interested customers have the same amount of days associated. Yes (median: 154 days, IQR: 82, 226); No (154 days, IQR: 82, 227).
hypothesis <- c(
"H1) Customers with HIGHER AGE are more likely to be interested in the car insurance.",
"H2) Women are likely to be interested in the car insurance.",
"H3) Customers having newer cars are more likely to be interested in the car insurance.",
"H4) Customers with previous car damage are more likely to accept the car insurance",
"H5) Customers with previous car insurance are more likely to accept the car insurance",
"H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses.",
"H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance "
)
conclusion <- c(
"True",
"False",
"False",
"True",
"False",
"False",
"False"
)
relevance <- c(
"High",
"Medium",
"High",
"High",
"High",
"Low",
"Low"
)
hypothesis_table <- tibble(
hypothesis = hypothesis,
conclusion = conclusion,
relevance = relevance
)
kable(hypothesis_table, format = "html") %>%
kableExtra::kable_styling(bootstrap_options = "striped",
full_width = FALSE)
hypothesis | conclusion | relevance |
---|---|---|
H1) Customers with HIGHER AGE are more likely to be interested in the car insurance. | True | High |
H2) Women are likely to be interested in the car insurance. | False | Medium |
H3) Customers having newer cars are more likely to be interested in the car insurance. | False | High |
H4) Customers with previous car damage are more likely to accept the car insurance | True | High |
H5) Customers with previous car insurance are more likely to accept the car insurance | False | High |
H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses. | False | Low |
H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance | False | Low |
Correlation Matrix
correlation_matrix <- df_cleaned %>%
select(age, days_associated, health_annual_paid) %>%
cor() %>%
round(3)
melted_correlation <- melt(correlation_matrix)
matrix_correlation_plot <- ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
geom_text(aes(Var1, Var2, label = value)) +
scale_fill_gradient2(low = "blue", high = "red",
limit = c(-1, 1), name = "Correlation") +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.background = element_blank())
matrix_correlation_plot
ggsave("img/matrix_correlation_plot.jpg", matrix_correlation_plot,
width = 8, height = 4)
There is no strong correlation among numerical variables.
Frequency encoding for
policy_sales_channel
| Build Function
✅
Target encoding for gender
e
region_code
| Build Function
✅
# For gender
gender_encoder <- df_cleaned %>%
mutate(response_num = ifelse(response == "yes", 1, 0)) %>%
group_by(gender) %>%
summarise(gender_num = mean(response_num, na.rm = TRUE))
# For region_code
region_encoder <- df_cleaned %>%
mutate(response_num = ifelse(response == "yes", 1, 0)) %>%
group_by(region_code) %>%
summarise(region_num = mean(response_num, na.rm = TRUE))
# Save in rds
saveRDS(gender_encoder, "gender_encoder.rds")
saveRDS(region_encoder, "region_encoder.rds")
policy_encoder <- df_cleaned %>%
group_by(policy_sales_channel) %>%
summarise(policy_num = n()/nrow(df_cleaned))
# Save in rds
saveRDS(policy_encoder, "policy_encoder.rds")
Using the created encoders in our dataset:
# Create function
encoder_function <- function(df){
df %>%
left_join(gender_encoder) %>%
select(-gender) %>%
rename(gender = gender_num) %>%
left_join(region_encoder) %>%
select(-region_code) %>%
rename(region_code = region_num) %>%
left_join(policy_encoder) %>%
select(-policy_sales_channel) %>%
rename(policy_sales_channel = policy_num)
}
# Save function in rds
saveRDS(encoder_function, "encoder_function.rds")
encoder_function <- readRDS("encoder_function.rds")
df_preprocessed <- encoder_function(df_cleaned)
set.seed(123)
df_split <- df_preprocessed %>%
initial_split(prop = 0.75, strata = response)
df_train <- df_split %>%
training()
df_test <- df_split %>%
testing()
Check response proportions:
df_train %>%
select(response) %>%
tbl_summary(type = list(response ~ "categorical"))
Characteristic | N = 285,8311 |
---|---|
response | |
yes | 35,032 (12%) |
no | 250,799 (88%) |
1 n (%) |
df_test %>%
select(response) %>%
tbl_summary(type = list(response ~ "categorical"))
Characteristic | N = 95,2781 |
---|---|
response | |
yes | 11,678 (12%) |
no | 83,600 (88%) |
1 n (%) |
Using tidymodels steps to continue the preprocessing:
df_recipe <- recipe(response ~.,
data = df_train) %>%
step_normalize(age, days_associated) %>%
step_scale(health_annual_paid) %>%
step_dummy(all_nominal(), -all_outcomes())
# Train the recipe
df_train_prep <- df_recipe %>%
prep(training = df_train)
df_train_processed <- df_train_prep %>%
bake(new_data = df_train)
# For the test dataset
df_test_processed <- df_train_prep %>%
bake(new_data = df_test)
predictors <- df_train_processed %>%
select(-id, -response)
target <- df_train_processed$response
# Check raw numbers
length(target)
## [1] 285831
nrow(predictors)
## [1] 285831
start_time <- Sys.time()
rf_model <- randomForest(predictors, target, ntree = 10,
importance = TRUE)
end_time <- Sys.time()
print(end_time - start_time)
## Time difference of 10.72002 secs
importance(rf_model) %>% View()
varImpPlot(rf_model)
In this first cycle we are going to select the seven most important variables according to the Mean Decrease Gini:
importance(rf_model) %>%
as_tibble(rownames = "rowname") %>%
arrange(desc(MeanDecreaseGini)) %>%
select(rowname) %>%
slice(1:7) %>%
pull()
## [1] "vehicle_damage_no" "days_associated" "age"
## [4] "health_annual_paid" "previously_insured_no" "policy_sales_channel"
## [7] "region_code"
selected_columns <- c(
"age",
"vehicle_damage",
"days_associated",
"previously_insured",
"health_annual_paid",
"policy_sales_channel",
"region_code",
"response"
)
# Final dataset
df_selected <- df_cleaned %>%
select(all_of(selected_columns)) %>% names()
saveRDS(df_selected, "df_selected.rds")