1 Imports

library(tidyverse)
library(janitor)
library(readr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(gridExtra)
library(summarytools)
library(randomForest)
library(reshape2)
library(tidymodels)

2 Data Collection

df <- read_csv("datasets/train.csv")
glimpse(df)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1 <- janitor::clean_names(df) %>% 
  rename(days_associated = vintage, 
         health_annual_paid = annual_premium) %>% 
  mutate(
    across(where(is.character), tolower),
    driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == "< 1 year" ~ "below_1_year",
      vehicle_age == "1-2 year" ~ "between_1_2_years",
      vehicle_age == "> 2 years" ~ "over_2_years"
    )
  ) %>% 
  mutate_if(is.character, as.factor) %>%
  mutate(response = factor(response, levels = c("yes", "no")),
         driving_license = factor(driving_license, levels = c("yes", "no")),
         previously_insured = factor(previously_insured, levels = c("yes", "no")),
         vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
         )

glimpse(df1)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…

# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")

3.1 Data Types

variable_classes <- tibble(variables = names(df1),
       type = unlist(lapply(df1, class)))
variable_classes

4 Column Description

variables <- df1 %>% names()
description <- c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer has DL (yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance (yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past (yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance (yes/no)"
)
df_description <- tibble(variables = variables,
       description = description)

kable(df_description, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)

variables	description
id	Unique ID for the customer
gender	Gender of the customer
age	Age of the customer
driving_license	Customer has DL (yes/no)
region_code	Unique code for the region of the customer
previously_insured	Customer already has Vehicle Insurance (yes/no)
vehicle_age	Age of the Vehicle
vehicle_damage	Customer got his/her vehicle damaged in the past (yes/no)
health_annual_paid	The amount customer needs to pay as premium in the year
policy_sales_channel	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
days_associated	Number of Days, Customer has been associated with the company
response	Customer is interested in car insurance (yes/no)

5 Descriptive Statistics

# Read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…

Check data structure so far:

skimr::skim(df_cleaned)

Data summary
Name	df_cleaned
Number of rows	381109
Number of columns	12
_______________________
Column type frequency:
factor	6
numeric	6
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
gender	1	FALSE	2	mal: 206089, fem: 175020
driving_license	1	FALSE	2	yes: 380297, no: 812
previously_insured	1	FALSE	2	no: 206481, yes: 174628
vehicle_age	1	FALSE	3	bet: 200316, bel: 164786, ove: 16007
vehicle_damage	1	FALSE	2	yes: 192413, no: 188696
response	1	FALSE	2	no: 334399, yes: 46710

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	190555.00	110016.84	1	95278	190555	285832	381109	▇▇▇▇▇
age	1	38.82	15.51	20	25	36	49	85	▇▃▃▂▁
region_code	1	26.39	13.23	0	15	28	35	52	▃▂▇▃▃
health_annual_paid	1	30564.39	17213.16	2630	24405	31669	39400	540165	▇▁▁▁▁
policy_sales_channel	1	112.03	54.20	1	29	133	152	163	▅▁▁▃▇
days_associated	1	154.35	83.67	10	82	154	227	299	▇▇▇▇▇

5.1 General overview

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ "categorical",
                driving_license ~ "categorical", 
                previously_insured ~ "categorical",
                vehicle_damage ~ "categorical"),
    digits = list(all_categorical() ~ c(0, 2))
  )

Characteristic	N = 381,109¹
gender
female	175,020 (45.92%)
male	206,089 (54.08%)
age	36 (25, 49)
driving_license
yes	380,297 (99.79%)
no	812 (0.21%)
region_code	28 (15, 35)
previously_insured
yes	174,628 (45.82%)
no	206,481 (54.18%)
vehicle_age
below_1_year	164,786 (43.24%)
between_1_2_years	200,316 (52.56%)
over_2_years	16,007 (4.20%)
vehicle_damage
yes	192,413 (50.49%)
no	188,696 (49.51%)
health_annual_paid	31,669 (24,405, 39,400)
policy_sales_channel	133 (29, 152)
days_associated	154 (82, 227)
response
yes	46,710 (12.26%)
no	334,399 (87.74%)
¹ n (%); Median (IQR)

5.2 More detailed statistics

num_attributes <- df_cleaned %>% 
  select(age, health_annual_paid, days_associated)

descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)

## Error : Can't find summarytools

kable(data.frame(descriptive_tab), format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)

	age	days_associated	health_annual_paid
Mean	38.82	154.35	30564.39
Std.Dev	15.51	83.67	17213.16
Min	20.00	10.00	2630.00
Q1	25.00	82.00	24405.00
Median	36.00	154.00	31669.00
Q3	49.00	227.00	39400.00
Max	85.00	299.00	540165.00
MAD	17.79	108.23	11125.43
IQR	24.00	145.00	14995.00
CV	0.40	0.54	0.56
Skewness	0.67	0.00	1.77
SE.Skewness	0.00	0.00	0.00
Kurtosis	-0.57	-1.20	34.00
N.Valid	381109.00	381109.00	381109.00
Pct.Valid	100.00	100.00	100.00

5.3 Visualization

Numerical attributes

# Age ---------------------
age_plt <- num_attributes %>% 
  ggplot(aes(x = age)) + 
  geom_histogram(aes(y = after_stat(density)), binwidth = 1,
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") +
  theme_minimal()

#  health_annual_paid ---------------------
paid_plt <- num_attributes %>% 
  ggplot(aes(x = health_annual_paid)) + 
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 10000,
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") +
  theme_minimal()

#  days_associated ---------------------
days_plt <- num_attributes %>% 
  ggplot(aes(x = days_associated)) + 
  geom_histogram(aes(y = after_stat(density)),
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated \nDistribution") +
  theme_minimal()


gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)

Categorical attributes:

num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>% 
  select(-id, -one_of(num_names))

gender_plt <- cat_attributes %>% 
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender)) +
  labs(x = "Gender", y = "#", 
       title = "Customers Gender") +
  theme_minimal()

driving_license_plt <- cat_attributes %>% 
  ggplot(aes(x = driving_license)) +
  geom_bar(aes(fill = driving_license),
           show.legend = FALSE) +
  labs(x = "Driving License", y = "#", 
       title = "Customers \nDriving License") +
  theme_minimal()

region_code_plt <- cat_attributes %>% 
  ggplot(aes(x = region_code)) +
  geom_bar(aes(fill = factor(region_code)),
           show.legend = FALSE) +
  labs(x = "Region Code", y = "#", 
       title = "Customers \nRegion Code") +
  theme_minimal()

previously_insured_plt <- cat_attributes %>% 
  ggplot(aes(x = previously_insured)) +
  geom_bar(aes(fill = previously_insured),
           show.legend = FALSE) +
  labs(x = "Previously Insured", y = "#", 
       title = "Customers \nPreviously Insured") +
  theme_minimal()

vehicle_age_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_age)) +
  geom_bar(aes(fill = vehicle_age),
           show.legend = FALSE) +
  labs(x = "vehicle_age", y = "#", 
       title = "Customers \nVehicle Age") +
  theme_minimal()

vehicle_damage_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_damage)) +
  geom_bar(aes(fill = vehicle_damage),
           show.legend = FALSE) +
  labs(x = "vehicle_damage", y = "#", 
       title = "Customers \nVehicle Damage") +
  theme_minimal()

policy_sales_channel_plt <- cat_attributes %>% 
  ggplot(aes(x = policy_sales_channel)) +
  geom_bar(aes(fill = factor(policy_sales_channel)),
           show.legend = FALSE) +
  labs(x = "policy_sales_channel", y = "#", 
       title = "Customers \nPolicy Sales Channel") +
  theme_minimal()

response_plt <- cat_attributes %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = response),
           show.legend = FALSE) +
  labs(x = "response", y = "#", 
       title = "Customers response") +
  theme_minimal()       
   
gridExtra::grid.arrange(gender_plt, driving_license_plt,
                        region_code_plt, previously_insured_plt,
                        vehicle_age_plt, vehicle_damage_plt, 
                        policy_sales_channel_plt, response_plt,
                        ncol = 2, nrow = 4)

6 Hypothesis validation

# Read data_cleaned
df_cleaned <- readRDS("df_cleaned.rds")

6.1 H1) Customers with HIGHER AGE are more likely to be interested in the car insurance. ✅

# Boxplot
age_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = age)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Age vs. Response Comparison", y = "Age", x = "Response") +
  theme_bw()
 
ggsave("img/age_boxplot.jpg", plot = age_boxplot, width = 6, 
       height = 4) 



# Histogram 
age_plot <- df_cleaned %>% 
  ggplot(aes(x = age)) +
  geom_histogram(binwidth = 1, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Age distribution")

age_plot

ggsave("img/age_plot.jpg", plot = age_plot, width = 6, 
       height = 4)

# Descriptive Statistics
df_cleaned %>% 
  select(age, response) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
age	43 (35, 51)	34 (24, 49)	<0.001
¹ Median (IQR)
² Wilcoxon rank sum test

Young people seems to be less interested in the car insurance. The median age for intereseted customers is 43 years (IQR: 35, 51), while the median for non-interested customers is 34 years (IQR: 24, 49).

6.2 H2) Women are likely to be interested in the car insurance. ❌

gender_plot <- df_cleaned %>% 
  select(response, gender) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = gender), position = 'dodge') +
  labs(title = "Gender vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

ggsave("img/gender_plot.png", plot = gender_plot)

df_cleaned %>% 
  select(response, gender) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
gender			<0.001
female	18,185 (39%)	156,835 (47%)
male	28,525 (61%)	177,564 (53%)
¹ n (%)
² Pearson’s Chi-squared test

For customers interested in the car insurance, 61% were men, and 39% were women. So, this hypothesis is FALSE. Although, gender and response are statistically significant, i. e., are related.

6.3 H3) Customers having newer cars are more likely to be interested in the car insurance. ❌

car_age_plot <- df_cleaned %>% 
  select(response, vehicle_age) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_age), position = 'dodge') +
  labs(title = "Vehicle Age vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

ggsave("img/car_age_plot.jpg", plot = car_age_plot)

df_cleaned %>% 
  select(response, vehicle_age) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
vehicle_age			<0.001
below_1_year	7,202 (15%)	157,584 (47%)
between_1_2_years	34,806 (75%)	165,510 (49%)
over_2_years	4,702 (10%)	11,305 (3.4%)
¹ n (%)
² Pearson’s Chi-squared test

Customers with cars aged between 1 and 2 years are more likely to be interested in the car insurance (75%). While, only 15% of the interested customers have newer cars.

6.4 H4) Customers with previous car damage are more likely to accept the car insurance ✅

car_damage_plot <- df_cleaned %>% 
  select(response, vehicle_damage) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_damage), position = 'dodge') +
  labs(title = "Vehicle Damage vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

car_damage_plot

ggsave("img/car_damage_plot.jpg", plot = car_damage_plot)

df_cleaned %>% 
  select(response, vehicle_damage) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
vehicle_damage	45,728 (98%)	146,685 (44%)	<0.001
¹ n (%)
² Pearson’s Chi-squared test

Customers with previous car damage are likely to be interested in the car insurance, as 98% said yes.

6.5 H5) Customers with previous car insurance are more likely to accept the car insurance ❌

car_insurance_plot <- df_cleaned %>% 
  select(response, previously_insured) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = previously_insured), position = 'dodge') +
  labs(title = "Vehicle Previously Insured vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()
car_insurance_plot

ggsave("img/car_insurance_plot.jpg", plot = car_insurance_plot)

df_cleaned %>% 
  select(response, previously_insured) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
previously_insured	158 (0.3%)	174,470 (52%)	<0.001
¹ n (%)
² Pearson’s Chi-squared test

Only 0.3% of customers interested in the car insurance have the car previously insured.

6.6 H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses. ❌

# Boxplot
expenses_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = health_annual_paid)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Annual Payment vs. Response Comparison", 
       y = "Annual Payment", x = "Response") +
  theme_bw()
 
ggsave("img/expenses_boxplot.jpg", 
       plot = expenses_boxplot, width = 6, 
       height = 4) 

# Histogram 
expenses_plot <- df_cleaned %>% 
  ggplot(aes(x = health_annual_paid)) +
  geom_histogram(binwidth = 10000, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Expenses distribution")

expenses_plot

ggsave("img/expenses_plot.jpg", plot = expenses_plot, 
       width = 6, 
       height = 4)

# Descriptive Statistics
df_cleaned %>% 
  select(response, health_annual_paid) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
health_annual_paid	33,002 (24,868, 41,297)	31,504 (24,351, 39,120)	<0.001
¹ Median (IQR)
² Wilcoxon rank sum test

Although the health annual paid showed to be significant, we consider this hypotheses FALSE, and will further investigate the outliers.

6.7 H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance ❌

# Boxplot
days_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = days_associated)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Days associated vs. Response Comparison", 
       y = "Days", x = "Response") +
  theme_bw()
 
ggsave("img/days_boxplot.jpg", 
       plot = days_boxplot, width = 6, 
       height = 4) 

# Histogram 
days_plot <- df_cleaned %>% 
  ggplot(aes(x = days_associated)) +
  geom_histogram(binwidth = 30, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Days associated distribution")

days_plot

ggsave("img/days_plot.jpg", plot = days_plot, 
       width = 6, 
       height = 4)

df_cleaned %>% 
  select(response, days_associated) %>% 
  tbl_summary(by = response) %>% 
  add_p()

Characteristic	yes, N = 46,710¹	no, N = 334,399¹	p-value²
days_associated	154 (82, 226)	154 (82, 227)	0.5
¹ Median (IQR)
² Wilcoxon rank sum test

This hypothesis is FALSE, basically the interested customers and non-interested customers have the same amount of days associated. Yes (median: 154 days, IQR: 82, 226); No (154 days, IQR: 82, 227).

6.8 Hypothesis conclusion

hypothesis <- c(
  "H1) Customers with HIGHER AGE are more likely to be interested in the car insurance.", 
  "H2) Women are likely to be interested in the car insurance.",
  "H3) Customers having newer cars are more likely to be interested in the car insurance.",
  "H4) Customers with previous car damage are more likely to accept the car insurance",
  "H5) Customers with previous car insurance are more likely to accept the car insurance",
  "H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses.",
  "H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance "
  )

conclusion <- c(
  "True", 
  "False",
  "False",
  "True",
  "False",
  "False",
  "False"
)

relevance <- c(
  "High", 
  "Medium",
  "High",
  "High",
  "High",
  "Low",
  "Low"
)

hypothesis_table <- tibble(
  hypothesis = hypothesis,
  conclusion = conclusion,
  relevance = relevance
)

kable(hypothesis_table, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)

hypothesis	conclusion	relevance
H1) Customers with HIGHER AGE are more likely to be interested in the car insurance.	True	High
H2) Women are likely to be interested in the car insurance.	False	Medium
H3) Customers having newer cars are more likely to be interested in the car insurance.	False	High
H4) Customers with previous car damage are more likely to accept the car insurance	True	High
H5) Customers with previous car insurance are more likely to accept the car insurance	False	High
H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses.	False	Low
H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance	False	Low

7 Multivariable analysis

Correlation Matrix

correlation_matrix <- df_cleaned %>% 
  select(age, days_associated, health_annual_paid) %>% 
  cor() %>% 
  round(3)

Visualization:

melted_correlation <- melt(correlation_matrix)

matrix_correlation_plot <- ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(Var1, Var2, label = value)) +
  scale_fill_gradient2(low = "blue", high = "red",
                       limit = c(-1, 1), name = "Correlation") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.background = element_blank())

matrix_correlation_plot

ggsave("img/matrix_correlation_plot.jpg", matrix_correlation_plot,
       width = 8, height = 4)

There is no strong correlation among numerical variables.

8 Data Preparation

Frequency encoding for policy_sales_channel | Build Function ✅
Target encoding for gender e region_code | Build Function ✅

8.1 Target encoding

# For gender
gender_encoder <- df_cleaned %>% 
  mutate(response_num = ifelse(response == "yes", 1, 0)) %>% 
  group_by(gender) %>% 
  summarise(gender_num = mean(response_num, na.rm = TRUE))
  
# For region_code  
region_encoder <- df_cleaned %>% 
  mutate(response_num = ifelse(response == "yes", 1, 0)) %>% 
  group_by(region_code) %>% 
  summarise(region_num = mean(response_num, na.rm = TRUE))

# Save in rds
saveRDS(gender_encoder, "gender_encoder.rds")
saveRDS(region_encoder, "region_encoder.rds")

8.2 Frequency encoding

policy_encoder <- df_cleaned %>% 
  group_by(policy_sales_channel) %>% 
  summarise(policy_num = n()/nrow(df_cleaned))

# Save in rds
saveRDS(policy_encoder, "policy_encoder.rds")

Using the created encoders in our dataset:

# Create function
encoder_function <- function(df){
  df %>% 
  left_join(gender_encoder) %>% 
  select(-gender) %>% 
  rename(gender = gender_num) %>% 
  left_join(region_encoder) %>% 
  select(-region_code) %>% 
  rename(region_code = region_num) %>% 
  left_join(policy_encoder) %>% 
  select(-policy_sales_channel) %>% 
  rename(policy_sales_channel = policy_num) 
}

# Save function in rds
saveRDS(encoder_function, "encoder_function.rds")

Splitting into train and test datasets

encoder_function <- readRDS("encoder_function.rds")

df_preprocessed <- encoder_function(df_cleaned)

set.seed(123)

df_split <- df_preprocessed %>% 
  initial_split(prop = 0.75, strata = response)

df_train <- df_split %>% 
  training()

df_test <- df_split %>% 
  testing()

Check response proportions:
```
df_train %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
```
Characteristic N = 285,831¹

response

yes 35,032 (12%)

no 250,799 (88%)

¹ n (%)
```
df_test %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
```
Characteristic N = 95,278¹

response

yes 11,678 (12%)

no 83,600 (88%)

¹ n (%)
Using tidymodels steps to continue the preprocessing:

Characteristic	N = 285,831¹
response
yes	35,032 (12%)
no	250,799 (88%)
¹ n (%)

Characteristic	N = 95,278¹
response
yes	11,678 (12%)
no	83,600 (88%)
¹ n (%)

df_recipe <- recipe(response ~.,
       data = df_train) %>% 
  step_normalize(age, days_associated) %>% 
  step_scale(health_annual_paid) %>% 
  step_dummy(all_nominal(), -all_outcomes())

Applying the recipe:

# Train the recipe
df_train_prep <- df_recipe %>% 
  prep(training = df_train)

df_train_processed <- df_train_prep %>% 
  bake(new_data = df_train)

# For the test dataset 
df_test_processed <- df_train_prep %>% 
  bake(new_data = df_test)

9 Feature selection

predictors <- df_train_processed %>% 
  select(-id, -response)

target <- df_train_processed$response

# Check raw numbers
length(target)

## [1] 285831

nrow(predictors)

## [1] 285831

start_time <- Sys.time()

rf_model <- randomForest(predictors, target, ntree = 10,
                         importance = TRUE)

end_time <- Sys.time()

print(end_time - start_time)

## Time difference of 10.72002 secs

Show results:

importance(rf_model) %>% View()

varImpPlot(rf_model)

In this first cycle we are going to select the seven most important variables according to the Mean Decrease Gini:

importance(rf_model) %>% 
  as_tibble(rownames = "rowname") %>% 
  arrange(desc(MeanDecreaseGini)) %>% 
  select(rowname) %>% 
  slice(1:7) %>% 
  pull()

## [1] "vehicle_damage_no"     "days_associated"       "age"                  
## [4] "health_annual_paid"    "previously_insured_no" "policy_sales_channel" 
## [7] "region_code"

selected_columns <- c(
  "age",
  "vehicle_damage",
  "days_associated",
  "previously_insured",
  "health_annual_paid", 
  "policy_sales_channel", 
  "region_code",
  "response"
)

# Final dataset
df_selected <- df_cleaned %>% 
  select(all_of(selected_columns)) %>% names()

saveRDS(df_selected, "df_selected.rds")

Health Insurance Cross Sell - Part 01

Edneide Ramalho

14/11/2023