
1 Imports

library(tidyverse)
library(readr)
library(skimr)
library(janitor)
library(moments)
library(summarytools)
library(gridExtra)
library(gtsummary)
library(tidymodels)
library(knitr)      # kable()
library(kableExtra) # kable_styling()

2 Data Collection

The data can be found on Kaggle.

df <- read_csv("datasets/archive/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Description

3.1 Clean and Rename Columns

  • Convert all variable names to snake_case.

  • Rename vintage to days_associated.

df1 <- janitor::clean_names(df) %>% 
  rename(days_associated = vintage)
names(df1)
##  [1] "id"                   "gender"               "age"                 
##  [4] "driving_license"      "region_code"          "previously_insured"  
##  [7] "vehicle_age"          "vehicle_damage"       "annual_premium"      
## [10] "policy_sales_channel" "days_associated"      "response"

3.2 Data Types

str(df1)
## spc_tbl_ [381,109 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : num [1:381109] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : chr [1:381109] "Male" "Male" "Male" "Male" ...
##  $ age                 : num [1:381109] 44 76 47 21 29 24 23 56 24 32 ...
##  $ driving_license     : num [1:381109] 1 1 1 1 1 1 1 1 1 1 ...
##  $ region_code         : num [1:381109] 28 3 28 11 41 33 11 28 3 6 ...
##  $ previously_insured  : num [1:381109] 0 0 0 1 1 0 0 0 1 1 ...
##  $ vehicle_age         : chr [1:381109] "> 2 Years" "1-2 Year" "> 2 Years" "< 1 Year" ...
##  $ vehicle_damage      : chr [1:381109] "Yes" "No" "Yes" "No" ...
##  $ annual_premium      : num [1:381109] 40454 33536 38294 28619 27496 ...
##  $ policy_sales_channel: num [1:381109] 26 26 26 152 152 160 152 26 152 152 ...
##  $ days_associated     : num [1:381109] 217 183 27 203 39 176 249 72 28 80 ...
##  $ response            : num [1:381109] 1 0 1 0 0 0 0 1 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   Gender = col_character(),
##   ..   Age = col_double(),
##   ..   Driving_License = col_double(),
##   ..   Region_Code = col_double(),
##   ..   Previously_Insured = col_double(),
##   ..   Vehicle_Age = col_character(),
##   ..   Vehicle_Damage = col_character(),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_double(),
##   ..   Vintage = col_double(),
##   ..   Response = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Or, more neatly organized in a tibble:

class_vec <- unlist(lapply(df1, class))
names_vc <- names(df1)
data_types <- tibble(variable = names_vc, type = class_vec)
data_types

Required transformations:

  • All variables of type character should be converted to factor.

  • Variables of class numeric can stay as they are, whether float or integer.

  • Convert 0/1 variables to no and yes, respectively.

  • Lowercase the categories of the categorical variables for standardization.

  • The response variable needs yes as the first factor level.

  • Rename the categories of vehicle_age.

df2 <- df1 %>% 
  mutate(
    across(where(is.character), tolower),
         driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == '> 2 years' ~ 'over_2_years',
      vehicle_age == '1-2 year' ~ 'between_1_2_years',
      vehicle_age == '< 1 year' ~ 'below_1_year'
      )
    ) %>% 
  mutate_if(is.character, as.factor) 
# Check levels of response variables
df2$response %>% 
  levels()
## [1] "no"  "yes"
# Correct order
df2 <- df2 %>% 
  mutate(response = factor(response, levels = c("yes", "no")))

3.3 Columns Description

Column Meaning
id unique customer identifier.
gender customer gender: Male / Female.
age customer age.
region_code customer region code.
policy_sales_channel anonymized code for the channel used to reach the customer, e.g., over mail, over phone, in person.
driving_license 0 = customer does not have a driving license, 1 = already has one.
vehicle_age vehicle age: < 1 Year, 1-2 Year, > 2 Years.
vehicle_damage No = customer has never had their vehicle damaged in the past, Yes = has had it.
previously_insured 0 = customer does not have vehicle insurance, 1 = already has vehicle insurance.
annual_premium annual amount paid by the customer to the company for health insurance. Currency: Rs (Pakistani rupee, R$1.00 ≈ Rs0.03).
days_associated number of days since the customer joined the company by purchasing health insurance. The policy is annual.
response 0 = customer is not interested, 1 = customer is interested.

3.4 Data info

skim(df2)
Data summary
Name df2
Number of rows 381109
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 mal: 206089, fem: 175020
driving_license 0 1 FALSE 2 yes: 380297, no: 812
previously_insured 0 1 FALSE 2 no: 206481, yes: 174628
vehicle_age 0 1 FALSE 3 bet: 200316, bel: 164786, ove: 16007
vehicle_damage 0 1 FALSE 2 yes: 192413, no: 188696
response 0 1 FALSE 2 no: 334399, yes: 46710

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 190555.00 110016.84 1 95278 190555 285832 381109 ▇▇▇▇▇
age 0 1 38.82 15.51 20 25 36 49 85 ▇▃▃▂▁
region_code 0 1 26.39 13.23 0 15 28 35 52 ▃▂▇▃▃
annual_premium 0 1 30564.39 17213.16 2630 24405 31669 39400 540165 ▇▁▁▁▁
policy_sales_channel 0 1 112.03 54.20 1 29 133 152 163 ▅▁▁▃▇
days_associated 0 1 154.35 83.67 10 82 154 227 299 ▇▇▇▇▇
  • There are no NA values, so we don't need to worry about imputing missing data (the quick check below confirms this).
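Counting NA values per column with base R:

colSums(is.na(df2)) # all zeros means there is nothing to impute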

3.5 Descriptive Statistics

3.5.1 General

df2 %>% 
  select(-id) %>% 
  tbl_summary(digits = list(all_categorical() ~ c(0, 2)))
Characteristic N = 381,109¹
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license 380,297 (99.79%)
region_code 28 (15, 35)
previously_insured 174,628 (45.82%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage 192,413 (50.49%)
annual_premium 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response 46,710 (12.26%)
¹ n (%); Median (IQR)
  • To see the explicit “yes”/“no” categories, we must specify these variables as categorical.
df2 %>% 
  select(-id) %>% 
  tbl_summary(type = list(response ~ "categorical",
                          driving_license ~ "categorical",
                          previously_insured ~ "categorical",
                          vehicle_damage ~ "categorical"),
              digits = list(all_categorical() ~ c(0, 2)))
Characteristic N = 381,109¹
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license
    no 812 (0.21%)
    yes 380,297 (99.79%)
region_code 28 (15, 35)
previously_insured
    no 206,481 (54.18%)
    yes 174,628 (45.82%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage
    no 188,696 (49.51%)
    yes 192,413 (50.49%)
annual_premium 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response
    yes 46,710 (12.26%)
    no 334,399 (87.74%)
¹ n (%); Median (IQR)

3.5.2 More detailed statistics for numeric variables

num_attributes <- df2 %>% 
  select(age, annual_premium, days_associated)
descriptive_table <- descr(num_attributes, style = "rmarkdown")
kable(data.frame(descriptive_table), format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
age annual_premium days_associated
Mean 3.882258e+01 3.056439e+04 1.543474e+02
Std.Dev 1.551161e+01 1.721316e+04 8.367130e+01
Min 2.000000e+01 2.630000e+03 1.000000e+01
Q1 2.500000e+01 2.440500e+04 8.200000e+01
Median 3.600000e+01 3.166900e+04 1.540000e+02
Q3 4.900000e+01 3.940000e+04 2.270000e+02
Max 8.500000e+01 5.401650e+05 2.990000e+02
MAD 1.779120e+01 1.112543e+04 1.082298e+02
IQR 2.400000e+01 1.499500e+04 1.450000e+02
CV 3.995512e-01 5.631768e-01 5.420973e-01
Skewness 6.725337e-01 1.766073e+00 3.029500e-03
SE.Skewness 3.967800e-03 3.967800e-03 3.967800e-03
Kurtosis -5.656762e-01 3.400391e+01 -1.200697e+00
N.Valid 3.811090e+05 3.811090e+05 3.811090e+05
Pct.Valid 1.000000e+02 1.000000e+02 1.000000e+02

3.5.3 Visualization

  • Numeric
age_plt <- df2 %>%  
  ggplot(aes(x = age)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 1, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Age') +
  ylab('Customers by Age (density)') +
  ggtitle('Customers by Age')

paid_plt <- df2 %>%  
  ggplot(aes(x = annual_premium)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 5000, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Health Annual Paid') +
  ylab('Customers by Health Annual Premium (density)') +
  ggtitle('Customers by Health Annual Paid')

days_plt <- df2 %>%  
  ggplot(aes(x = days_associated)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 5, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Days Associated') +
  ylab('Customers by Days Associated (density)') +
  ggtitle('Customers by Days Associated')

grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)  

  • Categorical
# variables we don't want in cat_attributes
var_exc <- num_attributes %>% names()

cat_attributes <- df2 %>% 
  select(-any_of(var_exc), -id)

gender_plt <- cat_attributes %>%  
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender), show.legend = FALSE) 

region_plt <- cat_attributes %>%  
  ggplot(aes(x = factor(region_code))) +
  geom_bar(aes(fill = factor(region_code)), show.legend = FALSE) +
  xlab("region_code")

policy_plt <- cat_attributes %>%  
  ggplot(aes(x = factor(policy_sales_channel))) +
  geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = FALSE) +
  xlab("policy_sales_channel")

license_plt <- cat_attributes %>%  
  ggplot(aes(x = driving_license)) +
  geom_bar(aes(fill = driving_license), show.legend = FALSE) 

vehicle_age_plt <- cat_attributes %>%  
  ggplot(aes(x = vehicle_age)) +
  geom_bar(aes(fill = vehicle_age), show.legend = FALSE) 

damage_plt <- cat_attributes %>%  
  ggplot(aes(x = vehicle_damage)) +
  geom_bar(aes(fill = vehicle_damage), show.legend = FALSE) 


previous_plt <- cat_attributes %>%  
  ggplot(aes(x = previously_insured)) +
  geom_bar(aes(fill = previously_insured), show.legend = FALSE) 

response_plt <- cat_attributes %>%  
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = response), show.legend = FALSE) 

grid.arrange(gender_plt, region_plt, policy_plt, license_plt,
             vehicle_age_plt, damage_plt, previous_plt,
             response_plt, ncol = 2, nrow = 4)    

4 Hypothesis creation

htmltools::img(src = knitr::image_uri("img/mind_map_hypothesis.png"), width = "500", height = "500")

Customer hypotheses:

  • H1) Interest is greater in customers with HIGHER AGE.
  • H2) Interest is greater in FEMALE CUSTOMERS.

Vehicle insurance hypotheses:

  • H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE.

  • H4) Interest is greater in customers who HAD VEHICLE DAMAGE.

  • H5) Interest is greater in customers who HAD THEIR VEHICLE PREVIOUSLY INSURED.

Health insurance hypotheses:

  • H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES.

  • H7) Interest is greater in customers with MORE DAYS ASSOCIATED.

Offer-method hypotheses: (no data available)

Vehicle hypotheses: (no data available)

Seasonality hypotheses: (no data available)

Prioritization of hypotheses: all can be validated in the first CRISP cycle.

5 Exploratory Data Analysis

5.1 Univariate Analysis

Done in the descriptive statistics section above.

5.2 Bivariate Analysis

  • H1) Interest is greater in customers with HIGHER AGE. ✅
df2 %>% 
  ggplot(aes(x = response, y = age)) +
  stat_boxplot(geom ='errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE)

# add a statistical test
df2 %>% 
  select(age, response) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
age 43 (35, 51) 34 (24, 49) <0.001
¹ Median (IQR)
² Wilcoxon rank sum test
# let's take a look at the age distribution for each group
df2 %>% 
  ggplot(aes(age)) +
  geom_histogram(binwidth = 1, color = "gray", fill = "navyblue") +
  facet_grid(rows = vars(response)) +
  ggtitle("Age distribution")

It seems that age has a strong impact on the response: the age distribution differs between interested and not interested customers, and the statistical test confirms age is a significant factor for response (p < 0.001). Interested customers are, on average, older than not interested customers. So we can say this hypothesis is TRUE: interested customers are older.

  • H2) Interest is greater in FEMALE CUSTOMERS.
df2 %>% 
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender), show.legend = FALSE)

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = gender), position = 'dodge')

df2 %>% 
  select(gender) %>% 
  tbl_summary()
Characteristic N = 381,109¹
gender
    female 175,020 (46%)
    male 206,089 (54%)
¹ n (%)
df2 %>% 
  select(response, gender) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
gender <0.001
    female 18,185 (39%) 156,835 (47%)
    male 28,525 (61%) 177,564 (53%)
¹ n (%)
² Pearson’s Chi-squared test

61% of interested customers are male. This hypothesis is FALSE, although gender is a statistically significant variable for response (p < 0.001).

H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE.

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = vehicle_age), position = 'dodge')

df2 %>% 
  select(response, vehicle_age) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
vehicle_age <0.001
    below_1_year 7,202 (15%) 157,584 (47%)
    between_1_2_years 34,806 (75%) 165,510 (49%)
    over_2_years 4,702 (10%) 11,305 (3.4%)
¹ n (%)
² Pearson’s Chi-squared test

Most interested customers have vehicles aged between 1 and 2 years (75%), not the newest ones. So this hypothesis is FALSE. Vehicle age is nonetheless a significant variable for response.

H4) Interest is greater in customers who HAD VEHICLE DAMAGE.

df2 %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_damage), 
               position = 'dodge')

df2 %>% 
  select(response, vehicle_damage) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
vehicle_damage 45,728 (98%) 146,685 (44%) <0.001
¹ n (%)
² Pearson’s Chi-squared test

98% of interested customers had vehicle damage. So this hypothesis is TRUE.

H5) Interest is greater in customers who HAD THEIR VEHICLE PREVIOUSLY INSURED.

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = factor(previously_insured)), 
           position = 'dodge')

df2 %>% 
  select(response, previously_insured) %>% 
  mutate(previously_insured = factor(previously_insured)) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
previously_insured 158 (0.34%) 174,470 (52.17%) <0.001
¹ n (%)
² Pearson’s Chi-squared test

This hypothesis is FALSE. Only 0.34% of customers interested in insurance had their vehicle previously insured.

H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES.

df2 %>% 
  ggplot(aes(x = factor(response), y = annual_premium)) +
  geom_boxplot(aes(color = factor(response)), show.legend = FALSE)

df2 %>% 
  select(response, annual_premium) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
annual_premium 33,002 (24,868, 41,297) 31,504 (24,351, 39,120) <0.001
¹ Median (IQR)
² Wilcoxon rank sum test
# Checking annual_premium for each response group
df2 %>% 
  ggplot(aes(annual_premium)) +
  geom_histogram(binwidth = 10000) +
  facet_grid(rows = vars(response))

FALSE. Customers interested in insurance don’t have HIGHER ANNUAL HEALTH EXPENSES; both groups have roughly the same typical premium. The variable has a huge number of outliers and an enormous range, and with a sample this large even a small difference in medians produces a significant p-value.

H7) Interest is greater in customers with MORE DAYS ASSOCIATED.

df2 %>% 
  ggplot(aes(x = factor(response), y = days_associated)) +
  geom_boxplot(aes(color = factor(response)), show.legend = FALSE)

df2 %>% 
  select(response, days_associated) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
days_associated 154 (82, 226) 154 (82, 227) 0.5
¹ Median (IQR)
² Wilcoxon rank sum test
# Checking days_associated for each response group
df2 %>% 
  ggplot(aes(days_associated)) +
  geom_histogram(binwidth = 10, color = "gray", fill = "lightblue") +
  facet_grid(rows = vars(response))

FALSE. There is no difference in the number of days associated between the interested and not interested customer groups.

5.3 Hypothesis conclusion

Hypothesis Conclusion Estimated_Relevance
H1 - Interest is greater in customers with HIGHER AGE. True Medium
H2 - Interest is greater in FEMALE CUSTOMERS. False Low
H3 - Interest is greater in customers WITH MOST RECENT VEHICLE AGE. False High
H4 - Interest is greater in customers who HAD VEHICLE DAMAGE. True High
H5 - Interest is greater in Customers who HAD VEHICLE PREVIOUSLY INSURED. False High
H6 - Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES. False Low
H7 - Interest is greater in customers with MORE DAYS ASSOCIATED. False Low

6 Feature Selection

We will use Random Forest to select variables by importance.

library(randomForest)

predictors <- df2 %>% select(gender:days_associated)
target <- df2$response     

start_time <- Sys.time()
rf_model <- randomForest(predictors, target, ntree = 10, 
                         importance = TRUE)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste("Elapsed time in seconds:", round(elapsed_time, 2)))
## [1] "Elapsed time in seconds: 19.37"
# Access the variable importance scores
importance(rf_model)
##                              yes          no MeanDecreaseAccuracy
## gender                0.06869669   2.5298923            1.9084967
## age                  -2.82413657   5.5316309            6.1802037
## driving_license       0.90537135   0.4203814            0.8396680
## region_code          -4.13522654  12.8723633           15.4708946
## previously_insured   25.25440800 -16.4161763           -7.9808798
## vehicle_age          -6.84508567   8.5334384            9.5968620
## vehicle_damage       25.00425477 -17.5098838          -10.4359392
## annual_premium       -5.75777154   8.0191668            8.2737357
## policy_sales_channel  3.70009752   4.0266010            9.0407372
## days_associated      -0.43306515  -0.4084573           -0.5650863
##                      MeanDecreaseGini
## gender                      733.83820
## age                        6160.04285
## driving_license              38.67545
## region_code                3695.24498
## previously_insured         4702.67217
## vehicle_age                1567.51227
## vehicle_damage             5902.10013
## annual_premium             7438.30521
## policy_sales_channel       2864.53179
## days_associated            7771.61777
# Plot variable importance
varImpPlot(rf_model)

  1. MeanDecreaseAccuracy: This column represents the mean decrease in accuracy caused by randomly permuting the values of the variable across the out-of-bag (OOB) samples. In other words, it measures the decrease in model accuracy when the values of a particular variable are randomly shuffled. Higher values indicate more important variables.

  2. MeanDecreaseGini: This column represents the mean decrease in the Gini impurity caused by randomly permuting the values of the variable across the OOB samples. The Gini impurity measures the node impurity in a decision tree, and higher values indicate more important variables.
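To make the permutation idea behind MeanDecreaseAccuracy concrete, here is a minimal sketch, assuming the rf_model, predictors and target objects fitted above. Note that randomForest computes the real measure per tree on the out-of-bag samples, so its numbers differ from this in-sample illustration.

# Sketch: permutation importance for a single variable.
# Shuffle one predictor and measure how much accuracy drops.
set.seed(1512)
baseline_acc <- mean(predict(rf_model, predictors) == target)

permuted <- predictors
permuted$vehicle_damage <- sample(permuted$vehicle_damage)
permuted_acc <- mean(predict(rf_model, permuted) == target)

baseline_acc - permuted_acc # the larger the drop, the more important the variable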

In this first CRISP cycle, we will select the top 7 variables from the MeanDecreaseGini plot.

df_selected_feat <- df2 %>% 
  select(days_associated, annual_premium, previously_insured, age,
         vehicle_damage, region_code, policy_sales_channel, response)

7 Data Preparation

7.1 Splitting into training and test datasets

set.seed(1512)

health_split <- df_selected_feat %>% 
  initial_split(prop = 0.75,
                strata = response)

df_train <- health_split %>% 
  training()

df_test <- health_split %>% 
  testing()

7.1.1 Checking train and test sets

# Check number of rows in each dataset
print(paste0("#Rows Train: ", nrow(df_train)))
## [1] "#Rows Train: 285831"
print(paste0("#Rows Test: ", nrow(df_test)))
## [1] "#Rows Test: 95278"
# Distribution of response in training data
df_train %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
Characteristic N = 285,831¹
response
    yes 35,032 (12%)
    no 250,799 (88%)
¹ n (%)
# Distribution of response in test data
df_test %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
Characteristic N = 95,278¹
response
    yes 11,678 (12%)
    no 83,600 (88%)
¹ n (%)

We have the same proportions of “yes” and “no” in the training and test sets, as expected from the stratified split (strata = response).

7.2 Preprocessing

We will use a tidymodels recipe to set up the preprocessing of the variables. Once this recipe is defined, we can apply it to the training and test data.

  • Standardization: for numeric variables with a normal distribution.

  • Rescaling: for numeric variables with a non-normal distribution.

  • Encoding: transform categorical variables into numeric ones.

In this first cycle, we will apply the following transformations (a sketch of an alternative recipe follows this list):

  • Remove highly correlated variables.
  • Normalize all numeric variables.
  • One-hot encode the categorical variables.
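For reference, here is a sketch of how the correlation filter and min-max rescaling could be written with the same recipes API; the 0.8 threshold and [0, 1] range are illustrative choices, not the settings used in this cycle.

# Illustrative alternative recipe (not the one trained below)
alt_recipe <- recipe(response ~ ., data = df_train) %>%
  step_corr(all_numeric(), threshold = 0.8) %>%    # drop highly correlated numerics
  step_range(all_numeric(), min = 0, max = 1) %>%  # min-max rescaling to [0, 1]
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) # strict one-hot encoding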

7.2.1 Checking for highly correlated numeric variables

df_train %>% 
  select_if(is.numeric) %>% 
  cor()
##                      days_associated annual_premium          age  region_code
## days_associated         1.0000000000   0.0001439562 -0.001303359 -0.002435694
## annual_premium          0.0001439562   1.0000000000  0.068682407 -0.010880993
## age                    -0.0013033590   0.0686824066  1.000000000  0.042007616
## region_code            -0.0024356939  -0.0108809934  0.042007616  1.000000000
## policy_sales_channel    0.0001014691  -0.1128024119 -0.578551159 -0.041932309
##                      policy_sales_channel
## days_associated              0.0001014691
## annual_premium              -0.1128024119
## age                         -0.5785511595
## region_code                 -0.0419323087
## policy_sales_channel         1.0000000000
  • There are none above a typical cutoff (e.g., |r| > 0.8); the strongest correlation is -0.58, between age and policy_sales_channel.

7.2.2 Creating the preprocessing steps

df_recipe <- recipe(response ~ .,
                    data = df_train) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
df_recipe

# Train the recipe
df_train_prep <- df_recipe %>%
  prep(training = df_train)
df_train_prep

7.2.3 Apply recipe to training data

df_train_processed <- df_train_prep %>%
  bake(new_data = df_train)
df_train_processed %>% head()

7.2.4 Apply recipe to test data

df_test_processed <- df_train_prep %>%
  bake(new_data = df_test)
df_test_processed %>% head()

8 Models to be tested

  • Logistic Regression ✅

  • KNN ✅

  • Decision Tree ✅

  • Random Forest ✅

  • XGBoost ✅

8.1 Metrics

  • Accuracy, precision, recall and F1-score

In this first pass we simply fit each model and compute a single performance measurement on the test set; afterwards, we will consider cross-validation and hyperparameter tuning. A sketch of how these metrics are defined follows.
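As a reference for what these metrics mean, here is a minimal sketch of how they are computed from confusion-matrix counts, with “yes” as the positive class; metrics_from_counts is a hypothetical helper, not a yardstick function.

# Hypothetical helper: the four metrics from raw confusion-matrix counts
metrics_from_counts <- function(tp, fp, fn, tn) {
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)
  tibble(
    accuracy  = (tp + tn) / (tp + fp + fn + tn),
    precision = precision,
    recall    = recall,
    f_meas    = 2 * precision * recall / (precision + recall)
  )
}
metrics_from_counts(tp = 100, fp = 50, fn = 30, tn = 820) # made-up example counts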

8.2 Logistic Regression Model

8.2.1 Model specification

logistic_model <- logistic_reg() %>% 
  set_engine('glm') %>% 
  set_mode('classification')

8.2.2 Model fitting

start_time <- Sys.time()

logistic_fit <- logistic_model %>% 
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a logsitc model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a logsitc model: 1.84 seconds"

8.2.3 Predict outcome categories

class_preds <- logistic_fit %>% 
  predict(new_data = df_test,
          type = 'class')
class_preds

8.2.4 Estimated probabilities

prob_preds <- logistic_fit %>% 
  predict(new_data = df_test,
          type = 'prob')
prob_preds

8.2.5 Combining results

insurance_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

insurance_results 

8.2.6 Confusion matrix

conf_mat(insurance_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
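At the default 0.5 probability threshold the logistic model never predicts “yes”: with only about 12% positive cases, apparently no customer reaches an estimated probability above 0.5, so every prediction lands in the “no” row.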

8.2.7 Creating a metric set

custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
lr_metrics <- custom_metrics(insurance_results,
         truth = response,
         estimate = .pred_class)
lr_metrics
# ROC Curve
insurance_results %>% 
  roc_curve(truth = response, .pred_yes) %>% 
  autoplot()

# Calculating AUC
roc_auc(insurance_results,
        truth = response,
        .pred_yes)
# Saving metrics results
lr_metrics <-  pivot_wider(lr_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'logistic regression') %>% 
  select(model, accuracy:f_meas)
lr_metrics

8.3 KNN Model

More info: https://parsnip.tidymodels.org/reference/nearest_neighbor.html

8.3.1 Model specification

knn_model <- nearest_neighbor(weight_func = "rectangular", 
                              neighbors = 3) %>% 
  set_mode("classification") %>% 
  set_engine("kknn")

8.3.2 Model fitting

start_time <- Sys.time()

library(kknn)
knn_fit <- knn_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a knn model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a knn model: 10.21 seconds"
# Predictions 
class_preds <- knn_fit %>% 
  predict(new_data = df_test,
          type = 'class')
# Estimated Probabilities
prob_preds <- knn_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
knn_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(knn_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes  2468  5507
##        no   9210 78093
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
knn_metrics <- custom_metrics(knn_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
knn_metrics <-  pivot_wider(knn_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'KNN') %>% 
  select(model, accuracy:f_meas)
knn_metrics

8.4 Decision Tree

# Model Specification
dt_model <- decision_tree(tree_depth = 10) %>%
  set_engine("rpart") %>%
  set_mode("classification")
# Model fitting
start_time <- Sys.time()

dt_fit <- dt_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a decision tree model: ", round(elapsed_time, 2)))
## [1] "Elapsed time to train a decision tree model: 0.69"
dt_fit
## parsnip model object
## 
## n= 285831 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 285831 35032 no (0.1225619 0.8774381) *
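The fitted tree consists of a single terminal root node: no split passed rpart’s default complexity criterion, so the model predicts the majority class “no” for every customer, which explains the confusion matrix further below.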
library(rpart.plot)
dt_fit %>%
  extract_fit_engine() %>%
  rpart.plot()

# Predictions 
class_preds <- dt_fit %>% 
  predict(new_data = df_test,
          type = 'class')
# Estimated Probabilities
prob_preds <- dt_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
dt_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(dt_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
dt_metrics <- custom_metrics(dt_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
dt_metrics <-  pivot_wider(dt_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'decision tree') %>% 
  select(model, accuracy:f_meas)
dt_metrics

8.5 Random Forest

rf_model <- rand_forest(
  mtry = 3,
  trees = 1000,
  min_n = 100
) %>%
  set_mode("classification") %>%
  set_engine("ranger")
start_time <- Sys.time()

library(ranger)
rf_fit <- rf_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a random forest model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a random forest model: 7.39 seconds"
# Predictions 
class_preds <- rf_fit %>% 
  predict(new_data = df_test,
          type = 'class')

# Estimated Probabilities
prob_preds <- rf_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
rf_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

# Confusion Matrix
conf_mat(rf_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes   148   159
##        no  11530 83441
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
rf_metrics <- custom_metrics(rf_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
rf_metrics <-  pivot_wider(rf_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'random forest') %>% 
  select(model, accuracy:f_meas)
rf_metrics

8.6 XGBoost

# Model
xgb_model <- boost_tree(
  mtry = 1,
  trees = 5,
  min_n = 1000,
  tree_depth = 4,
  loss_reduction = 0,
  sample_size = 1
  ) %>%
  set_mode("classification") %>%
  set_engine("xgboost",
             cores = 1)
start_time <- Sys.time()

library(xgboost)
xgb_fit <- xgb_model %>%
  fit(response ~ .,
      data = df_train)
## [10:31:45] WARNING: src/learner.cc:767: 
## Parameters: { "cores" } are not used.
end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a xgboost model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a xgboost model: 0.63 seconds"
# Predictions 
class_preds <- xgb_fit %>% 
  predict(new_data = df_test,
          type = 'class')

# Estimated Probabilities
prob_preds <- xgb_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
xgb_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

# Confusion Matrix
conf_mat(xgb_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
xgb_metrics <- custom_metrics(xgb_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
xgb_metrics <-  pivot_wider(xgb_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'xgb') %>% 
  select(model, accuracy:f_meas)

8.7 Model comparison

model_comparison_df <- rbind(lr_metrics,
                             knn_metrics,
                             dt_metrics,
                             rf_metrics,
                             xgb_metrics) %>% 
  arrange(desc(precision))

model_comparison_df
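Note that the models that never predict “yes” at the default threshold (logistic regression, decision tree and XGBoost) have no predicted positives, so yardstick returns NA for their precision and F1; the ranking above therefore effectively compares only KNN and random forest. The cross-validation and hyperparameter tuning planned earlier are the natural next steps for handling this class imbalance.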