1 Imports

library(tidyverse)
library(janitor)
library(readr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(gridExtra)
library(summarytools)
library(randomForest)
library(reshape2)
library(tidymodels)

2 Data Collection

df <- read_csv("datasets/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1 <- janitor::clean_names(df) %>% 
  rename(days_associated = vintage, 
         health_annual_paid = annual_premium) %>% 
  mutate(
    across(where(is.character), tolower),
    driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == "< 1 year" ~ "below_1_year",
      vehicle_age == "1-2 year" ~ "between_1_2_years",
      vehicle_age == "> 2 years" ~ "over_2_years"
    )
  ) %>% 
  mutate_if(is.character, as.factor) %>%
  mutate(response = factor(response, levels = c("yes", "no")),
         driving_license = factor(driving_license, levels = c("yes", "no")),
         previously_insured = factor(previously_insured, levels = c("yes", "no")),
         vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
         )

glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")

3.1 Data Types

variable_classes <- tibble(variables = names(df1),
       type = unlist(lapply(df1, class)))
variable_classes

4 Column Description

variables <- df1 %>% names()
description <- c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer has DL (yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance (yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past (yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance (yes/no)"
)
df_description <- tibble(variables = variables,
       description = description)

kable(df_description, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)
variables description
id Unique ID for the customer
gender Gender of the customer
age Age of the customer
driving_license Customer has DL (yes/no)
region_code Unique code for the region of the customer
previously_insured Customer already has Vehicle Insurance (yes/no)
vehicle_age Age of the Vehicle
vehicle_damage Customer got his/her vehicle damaged in the past (yes/no)
health_annual_paid The amount customer needs to pay as premium in the year
policy_sales_channel Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
days_associated Number of Days, Customer has been associated with the company
response Customer is interested in car insurance (yes/no)

5 Descriptive Statistics

# Read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
  • Check data structure so far:
skimr::skim(df_cleaned)
Data summary
Name df_cleaned
Number of rows 381109
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 mal: 206089, fem: 175020
driving_license 0 1 FALSE 2 yes: 380297, no: 812
previously_insured 0 1 FALSE 2 no: 206481, yes: 174628
vehicle_age 0 1 FALSE 3 bet: 200316, bel: 164786, ove: 16007
vehicle_damage 0 1 FALSE 2 yes: 192413, no: 188696
response 0 1 FALSE 2 no: 334399, yes: 46710

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 190555.00 110016.84 1 95278 190555 285832 381109 ▇▇▇▇▇
age 0 1 38.82 15.51 20 25 36 49 85 ▇▃▃▂▁
region_code 0 1 26.39 13.23 0 15 28 35 52 ▃▂▇▃▃
health_annual_paid 0 1 30564.39 17213.16 2630 24405 31669 39400 540165 ▇▁▁▁▁
policy_sales_channel 0 1 112.03 54.20 1 29 133 152 163 ▅▁▁▃▇
days_associated 0 1 154.35 83.67 10 82 154 227 299 ▇▇▇▇▇

5.1 General overview

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ "categorical",
                driving_license ~ "categorical", 
                previously_insured ~ "categorical",
                vehicle_damage ~ "categorical"),
    digits = list(all_categorical() ~ c(0, 2))
  )
Characteristic N = 381,1091
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license
    yes 380,297 (99.79%)
    no 812 (0.21%)
region_code 28 (15, 35)
previously_insured
    yes 174,628 (45.82%)
    no 206,481 (54.18%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage
    yes 192,413 (50.49%)
    no 188,696 (49.51%)
health_annual_paid 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response
    yes 46,710 (12.26%)
    no 334,399 (87.74%)
1 n (%); Median (IQR)

5.2 More detailed statistics

num_attributes <- df_cleaned %>% 
  select(age, health_annual_paid, days_associated)
descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)
## Error : Can't find summarytools
kable(data.frame(descriptive_tab), format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)
age days_associated health_annual_paid
Mean 38.82 154.35 30564.39
Std.Dev 15.51 83.67 17213.16
Min 20.00 10.00 2630.00
Q1 25.00 82.00 24405.00
Median 36.00 154.00 31669.00
Q3 49.00 227.00 39400.00
Max 85.00 299.00 540165.00
MAD 17.79 108.23 11125.43
IQR 24.00 145.00 14995.00
CV 0.40 0.54 0.56
Skewness 0.67 0.00 1.77
SE.Skewness 0.00 0.00 0.00
Kurtosis -0.57 -1.20 34.00
N.Valid 381109.00 381109.00 381109.00
Pct.Valid 100.00 100.00 100.00

5.3 Visualization

  • Numerical attributes
# Age ---------------------
age_plt <- num_attributes %>% 
  ggplot(aes(x = age)) + 
  geom_histogram(aes(y = after_stat(density)), binwidth = 1,
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") +
  theme_minimal()

#  health_annual_paid ---------------------
paid_plt <- num_attributes %>% 
  ggplot(aes(x = health_annual_paid)) + 
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 10000,
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") +
  theme_minimal()

#  days_associated ---------------------
days_plt <- num_attributes %>% 
  ggplot(aes(x = days_associated)) + 
  geom_histogram(aes(y = after_stat(density)),
                 color = "gray", fill = "lightblue", alpha = 0.5) + geom_density(color = "blue") +
  labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated \nDistribution") +
  theme_minimal()


gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)

  • Categorical attributes:
num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>% 
  select(-id, -one_of(num_names)) 
gender_plt <- cat_attributes %>% 
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender)) +
  labs(x = "Gender", y = "#", 
       title = "Customers Gender") +
  theme_minimal()

driving_license_plt <- cat_attributes %>% 
  ggplot(aes(x = driving_license)) +
  geom_bar(aes(fill = driving_license),
           show.legend = FALSE) +
  labs(x = "Driving License", y = "#", 
       title = "Customers \nDriving License") +
  theme_minimal()

region_code_plt <- cat_attributes %>% 
  ggplot(aes(x = region_code)) +
  geom_bar(aes(fill = factor(region_code)),
           show.legend = FALSE) +
  labs(x = "Region Code", y = "#", 
       title = "Customers \nRegion Code") +
  theme_minimal()

previously_insured_plt <- cat_attributes %>% 
  ggplot(aes(x = previously_insured)) +
  geom_bar(aes(fill = previously_insured),
           show.legend = FALSE) +
  labs(x = "Previously Insured", y = "#", 
       title = "Customers \nPreviously Insured") +
  theme_minimal()

vehicle_age_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_age)) +
  geom_bar(aes(fill = vehicle_age),
           show.legend = FALSE) +
  labs(x = "vehicle_age", y = "#", 
       title = "Customers \nVehicle Age") +
  theme_minimal()

vehicle_damage_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_damage)) +
  geom_bar(aes(fill = vehicle_damage),
           show.legend = FALSE) +
  labs(x = "vehicle_damage", y = "#", 
       title = "Customers \nVehicle Damage") +
  theme_minimal()

policy_sales_channel_plt <- cat_attributes %>% 
  ggplot(aes(x = policy_sales_channel)) +
  geom_bar(aes(fill = factor(policy_sales_channel)),
           show.legend = FALSE) +
  labs(x = "policy_sales_channel", y = "#", 
       title = "Customers \nPolicy Sales Channel") +
  theme_minimal()

response_plt <- cat_attributes %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = response),
           show.legend = FALSE) +
  labs(x = "response", y = "#", 
       title = "Customers response") +
  theme_minimal()       
   
gridExtra::grid.arrange(gender_plt, driving_license_plt,
                        region_code_plt, previously_insured_plt,
                        vehicle_age_plt, vehicle_damage_plt, 
                        policy_sales_channel_plt, response_plt,
                        ncol = 2, nrow = 4)

6 Hypothesis validation

# Read data_cleaned
df_cleaned <- readRDS("df_cleaned.rds")

6.1 H1) Customers with HIGHER AGE are more likely to be interested in the car insurance. ✅

# Boxplot
age_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = age)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Age vs. Response Comparison", y = "Age", x = "Response") +
  theme_bw()
 
ggsave("img/age_boxplot.jpg", plot = age_boxplot, width = 6, 
       height = 4) 



# Histogram 
age_plot <- df_cleaned %>% 
  ggplot(aes(x = age)) +
  geom_histogram(binwidth = 1, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Age distribution")

age_plot

ggsave("img/age_plot.jpg", plot = age_plot, width = 6, 
       height = 4) 
# Descriptive Statistics
df_cleaned %>% 
  select(age, response) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
age 43 (35, 51) 34 (24, 49) <0.001
1 Median (IQR)
2 Wilcoxon rank sum test

Young people seems to be less interested in the car insurance. The median age for intereseted customers is 43 years (IQR: 35, 51), while the median for non-interested customers is 34 years (IQR: 24, 49).

6.2 H2) Women are likely to be interested in the car insurance.

gender_plot <- df_cleaned %>% 
  select(response, gender) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = gender), position = 'dodge') +
  labs(title = "Gender vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

ggsave("img/gender_plot.png", plot = gender_plot)
df_cleaned %>% 
  select(response, gender) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
gender <0.001
    female 18,185 (39%) 156,835 (47%)
    male 28,525 (61%) 177,564 (53%)
1 n (%)
2 Pearson’s Chi-squared test

For customers interested in the car insurance, 61% were men, and 39% were women. So, this hypothesis is FALSE. Although, gender and response are statistically significant, i. e., are related.

6.3 H3) Customers having newer cars are more likely to be interested in the car insurance.

car_age_plot <- df_cleaned %>% 
  select(response, vehicle_age) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_age), position = 'dodge') +
  labs(title = "Vehicle Age vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

ggsave("img/car_age_plot.jpg", plot = car_age_plot)
df_cleaned %>% 
  select(response, vehicle_age) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
vehicle_age <0.001
    below_1_year 7,202 (15%) 157,584 (47%)
    between_1_2_years 34,806 (75%) 165,510 (49%)
    over_2_years 4,702 (10%) 11,305 (3.4%)
1 n (%)
2 Pearson’s Chi-squared test

Customers with cars aged between 1 and 2 years are more likely to be interested in the car insurance (75%). While, only 15% of the interested customers have newer cars.

6.4 H4) Customers with previous car damage are more likely to accept the car insurance ✅

car_damage_plot <- df_cleaned %>% 
  select(response, vehicle_damage) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_damage), position = 'dodge') +
  labs(title = "Vehicle Damage vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()

car_damage_plot

ggsave("img/car_damage_plot.jpg", plot = car_damage_plot)
df_cleaned %>% 
  select(response, vehicle_damage) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
vehicle_damage 45,728 (98%) 146,685 (44%) <0.001
1 n (%)
2 Pearson’s Chi-squared test

Customers with previous car damage are likely to be interested in the car insurance, as 98% said yes.

6.5 H5) Customers with previous car insurance are more likely to accept the car insurance ❌

car_insurance_plot <- df_cleaned %>% 
  select(response, previously_insured) %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = previously_insured), position = 'dodge') +
  labs(title = "Vehicle Previously Insured vs. Response", 
       x = "Response", y = "Number of customers") +
  theme_bw()
car_insurance_plot

ggsave("img/car_insurance_plot.jpg", plot = car_insurance_plot)
df_cleaned %>% 
  select(response, previously_insured) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
previously_insured 158 (0.3%) 174,470 (52%) <0.001
1 n (%)
2 Pearson’s Chi-squared test

Only 0.3% of customers interested in the car insurance have the car previously insured.

6.6 H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses.

# Boxplot
expenses_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = health_annual_paid)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Annual Payment vs. Response Comparison", 
       y = "Annual Payment", x = "Response") +
  theme_bw()
 
ggsave("img/expenses_boxplot.jpg", 
       plot = expenses_boxplot, width = 6, 
       height = 4) 

# Histogram 
expenses_plot <- df_cleaned %>% 
  ggplot(aes(x = health_annual_paid)) +
  geom_histogram(binwidth = 10000, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Expenses distribution")

expenses_plot

ggsave("img/expenses_plot.jpg", plot = expenses_plot, 
       width = 6, 
       height = 4) 
# Descriptive Statistics
df_cleaned %>% 
  select(response, health_annual_paid) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
health_annual_paid 33,002 (24,868, 41,297) 31,504 (24,351, 39,120) <0.001
1 Median (IQR)
2 Wilcoxon rank sum test

Although the health annual paid showed to be significant, we consider this hypotheses FALSE, and will further investigate the outliers.

6.7 H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance

# Boxplot
days_boxplot <- df_cleaned %>% 
  ggplot(aes(x = response, y = days_associated)) +
  stat_boxplot(geom = 'errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE) + 
  labs(title = "Days associated vs. Response Comparison", 
       y = "Days", x = "Response") +
  theme_bw()
 
ggsave("img/days_boxplot.jpg", 
       plot = days_boxplot, width = 6, 
       height = 4) 

# Histogram 
days_plot <- df_cleaned %>% 
  ggplot(aes(x = days_associated)) +
  geom_histogram(binwidth = 30, color = "gray", fill="navy") +
  facet_wrap(vars(response), nrow = 2, scales = "free_y") +
  labs(y = "Number of clients") + 
  ggtitle("Days associated distribution")

days_plot

ggsave("img/days_plot.jpg", plot = days_plot, 
       width = 6, 
       height = 4) 
df_cleaned %>% 
  select(response, days_associated) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,7101 no, N = 334,3991 p-value2
days_associated 154 (82, 226) 154 (82, 227) 0.5
1 Median (IQR)
2 Wilcoxon rank sum test

This hypothesis is FALSE, basically the interested customers and non-interested customers have the same amount of days associated. Yes (median: 154 days, IQR: 82, 226); No (154 days, IQR: 82, 227).

6.8 Hypothesis conclusion

hypothesis <- c(
  "H1) Customers with HIGHER AGE are more likely to be interested in the car insurance.", 
  "H2) Women are likely to be interested in the car insurance.",
  "H3) Customers having newer cars are more likely to be interested in the car insurance.",
  "H4) Customers with previous car damage are more likely to accept the car insurance",
  "H5) Customers with previous car insurance are more likely to accept the car insurance",
  "H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses.",
  "H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance "
  )

conclusion <- c(
  "True", 
  "False",
  "False",
  "True",
  "False",
  "False",
  "False"
)

relevance <- c(
  "High", 
  "Medium",
  "High",
  "High",
  "High",
  "Low",
  "Low"
)

hypothesis_table <- tibble(
  hypothesis = hypothesis,
  conclusion = conclusion,
  relevance = relevance
)

kable(hypothesis_table, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped",
                            full_width = FALSE)
hypothesis conclusion relevance
H1) Customers with HIGHER AGE are more likely to be interested in the car insurance. True High
H2) Women are likely to be interested in the car insurance. False Medium
H3) Customers having newer cars are more likely to be interested in the car insurance. False High
H4) Customers with previous car damage are more likely to accept the car insurance True High
H5) Customers with previous car insurance are more likely to accept the car insurance False High
H6) Interest is greater in customers with HIGHER ANNUAL HEALTH expenses. False Low
H7) Customers with health insurance for MORE TIME are more likely to be interested in car insurance False Low

7 Multivariable analysis

Correlation Matrix

correlation_matrix <- df_cleaned %>% 
  select(age, days_associated, health_annual_paid) %>% 
  cor() %>% 
  round(3)
  • Visualization:
melted_correlation <- melt(correlation_matrix)

matrix_correlation_plot <- ggplot(melted_correlation, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(Var1, Var2, label = value)) +
  scale_fill_gradient2(low = "blue", high = "red",
                       limit = c(-1, 1), name = "Correlation") +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.background = element_blank())

matrix_correlation_plot

ggsave("img/matrix_correlation_plot.jpg", matrix_correlation_plot,
       width = 8, height = 4)

There is no strong correlation among numerical variables.

8 Data Preparation

  • Frequency encoding for policy_sales_channel | Build Function ✅

  • Target encoding for gender e region_code | Build Function ✅

8.1 Target encoding

# For gender
gender_encoder <- df_cleaned %>% 
  mutate(response_num = ifelse(response == "yes", 1, 0)) %>% 
  group_by(gender) %>% 
  summarise(gender_num = mean(response_num, na.rm = TRUE))
  
# For region_code  
region_encoder <- df_cleaned %>% 
  mutate(response_num = ifelse(response == "yes", 1, 0)) %>% 
  group_by(region_code) %>% 
  summarise(region_num = mean(response_num, na.rm = TRUE))

# Save in rds
saveRDS(gender_encoder, "gender_encoder.rds")
saveRDS(region_encoder, "region_encoder.rds")

8.2 Frequency encoding

policy_encoder <- df_cleaned %>% 
  group_by(policy_sales_channel) %>% 
  summarise(policy_num = n()/nrow(df_cleaned))

# Save in rds
saveRDS(policy_encoder, "policy_encoder.rds")
  • Using the created encoders in our dataset:

    # Create function
    encoder_function <- function(df){
      df %>% 
      left_join(gender_encoder) %>% 
      select(-gender) %>% 
      rename(gender = gender_num) %>% 
      left_join(region_encoder) %>% 
      select(-region_code) %>% 
      rename(region_code = region_num) %>% 
      left_join(policy_encoder) %>% 
      select(-policy_sales_channel) %>% 
      rename(policy_sales_channel = policy_num) 
    }
    # Save function in rds
    saveRDS(encoder_function, "encoder_function.rds")

    Splitting into train and test datasets

    encoder_function <- readRDS("encoder_function.rds")
    
    df_preprocessed <- encoder_function(df_cleaned)
    set.seed(123)
    
    df_split <- df_preprocessed %>% 
      initial_split(prop = 0.75, strata = response)
    
    df_train <- df_split %>% 
      training()
    
    df_test <- df_split %>% 
      testing()
  • Check response proportions:

    df_train %>% 
      select(response) %>% 
      tbl_summary(type = list(response ~ "categorical"))
    Characteristic N = 285,8311
    response
        yes 35,032 (12%)
        no 250,799 (88%)
    1 n (%)
    df_test %>% 
      select(response) %>% 
      tbl_summary(type = list(response ~ "categorical"))
    Characteristic N = 95,2781
    response
        yes 11,678 (12%)
        no 83,600 (88%)
    1 n (%)
  • Using tidymodels steps to continue the preprocessing:

df_recipe <- recipe(response ~.,
       data = df_train) %>% 
  step_normalize(age, days_associated) %>% 
  step_scale(health_annual_paid) %>% 
  step_dummy(all_nominal(), -all_outcomes())
  • Applying the recipe:
# Train the recipe
df_train_prep <- df_recipe %>% 
  prep(training = df_train)

df_train_processed <- df_train_prep %>% 
  bake(new_data = df_train) 
# For the test dataset 
df_test_processed <- df_train_prep %>% 
  bake(new_data = df_test) 

9 Feature selection

predictors <- df_train_processed %>% 
  select(-id, -response)

target <- df_train_processed$response

# Check raw numbers
length(target)
## [1] 285831
nrow(predictors) 
## [1] 285831
start_time <- Sys.time()

rf_model <- randomForest(predictors, target, ntree = 10,
                         importance = TRUE)

end_time <- Sys.time()

print(end_time - start_time)
## Time difference of 10.72002 secs
  • Show results:
importance(rf_model) %>% View()
varImpPlot(rf_model)

In this first cycle we are going to select the seven most important variables according to the Mean Decrease Gini:

importance(rf_model) %>% 
  as_tibble(rownames = "rowname") %>% 
  arrange(desc(MeanDecreaseGini)) %>% 
  select(rowname) %>% 
  slice(1:7) %>% 
  pull()
## [1] "vehicle_damage_no"     "days_associated"       "age"                  
## [4] "health_annual_paid"    "previously_insured_no" "policy_sales_channel" 
## [7] "region_code"
selected_columns <- c(
  "age",
  "vehicle_damage",
  "days_associated",
  "previously_insured",
  "health_annual_paid", 
  "policy_sales_channel", 
  "region_code",
  "response"
)
# Final dataset
df_selected <- df_cleaned %>% 
  select(all_of(selected_columns)) %>% names()

saveRDS(df_selected, "df_selected.rds")