
1 Imports

library(tidyverse)
library(readr)
library(skimr)
library(janitor)
library(moments)
library(summarytools)
library(gridExtra)
library(gtsummary)
library(tidymodels)
library(knitr)      # kable()
library(kableExtra) # kable_styling()

2 Data Collection

The data can be found on Kaggle.

df <- read_csv("datasets/archive/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Description

3.1 Clean and Rename Columns

  • Convert all variable names to snake_case.

  • Rename vintage to days_associated.

df1 <- janitor::clean_names(df) %>% 
  rename(days_associated = vintage)
names(df1)
##  [1] "id"                   "gender"               "age"                 
##  [4] "driving_license"      "region_code"          "previously_insured"  
##  [7] "vehicle_age"          "vehicle_damage"       "annual_premium"      
## [10] "policy_sales_channel" "days_associated"      "response"

3.2 Data Types

str(df1)
## spc_tbl_ [381,109 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : num [1:381109] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender              : chr [1:381109] "Male" "Male" "Male" "Male" ...
##  $ age                 : num [1:381109] 44 76 47 21 29 24 23 56 24 32 ...
##  $ driving_license     : num [1:381109] 1 1 1 1 1 1 1 1 1 1 ...
##  $ region_code         : num [1:381109] 28 3 28 11 41 33 11 28 3 6 ...
##  $ previously_insured  : num [1:381109] 0 0 0 1 1 0 0 0 1 1 ...
##  $ vehicle_age         : chr [1:381109] "> 2 Years" "1-2 Year" "> 2 Years" "< 1 Year" ...
##  $ vehicle_damage      : chr [1:381109] "Yes" "No" "Yes" "No" ...
##  $ annual_premium      : num [1:381109] 40454 33536 38294 28619 27496 ...
##  $ policy_sales_channel: num [1:381109] 26 26 26 152 152 160 152 26 152 152 ...
##  $ days_associated     : num [1:381109] 217 183 27 203 39 176 249 72 28 80 ...
##  $ response            : num [1:381109] 1 0 1 0 0 0 0 1 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   Gender = col_character(),
##   ..   Age = col_double(),
##   ..   Driving_License = col_double(),
##   ..   Region_Code = col_double(),
##   ..   Previously_Insured = col_double(),
##   ..   Vehicle_Age = col_character(),
##   ..   Vehicle_Damage = col_character(),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_double(),
##   ..   Vintage = col_double(),
##   ..   Response = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Or, more neatly organized in a tibble:

class_vec <- unlist(lapply(df1, class))
names_vc <- names(df1)
data_types <- tibble(variable = names_vc, type = class_vec)
data_types

Required transformations:

  • All variables of type character should be converted to factor.

  • Variables of class numeric can stay as they are, whether float or integer.

  • Convert 0/1 variables to no and yes, respectively.

  • Lowercase the categories of the categorical variables for standardization.

  • The response variable needs yes as the first factor level.

  • Rename the categories of vehicle_age.

df2 <- df1 %>% 
  mutate(
    across(where(is.character), tolower),
         driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == '> 2 years' ~ 'over_2_years',
      vehicle_age == '1-2 year' ~ 'between_1_2_years',
      vehicle_age == '< 1 year' ~ 'below_1_year'
      )
    ) %>% 
  mutate_if(is.character, as.factor) 
# Check levels of response variables
df2$response %>% 
  levels()
## [1] "no"  "yes"
# Correct order
df2 <- df2 %>% 
  mutate(response = factor(response, levels = c("yes", "no")))

3.3 Columns Description

Column Meaning
id unique customer identifier.
gender customer gender: Male / Female.
age customer age.
region_code customer region code.
policy_sales_channel anonymized code for the channel used to reach the customer, e.g., over mail, over phone, in person.
driving_license 0 = customer does not have a driving license, 1 = already has one.
vehicle_age vehicle age: < 1 Year, 1-2 Year, > 2 Years.
vehicle_damage No = customer has never had their vehicle damaged in the past, Yes = has had it.
previously_insured 0 = customer does not have vehicle insurance, 1 = already has vehicle insurance.
annual_premium annual amount paid by the customer to the company for health insurance. Currency: Rs (Pakistani rupee, R$1.00 ≈ Rs0.03).
days_associated number of days since the customer joined the company by purchasing health insurance. The policy is annual.
response 0 = customer is not interested, 1 = customer is interested.

3.4 Data info

skim(df2)
Data summary
Name df2
Number of rows 381109
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 mal: 206089, fem: 175020
driving_license 0 1 FALSE 2 yes: 380297, no: 812
previously_insured 0 1 FALSE 2 no: 206481, yes: 174628
vehicle_age 0 1 FALSE 3 bet: 200316, bel: 164786, ove: 16007
vehicle_damage 0 1 FALSE 2 yes: 192413, no: 188696
response 0 1 FALSE 2 no: 334399, yes: 46710

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 190555.00 110016.84 1 95278 190555 285832 381109 ▇▇▇▇▇
age 0 1 38.82 15.51 20 25 36 49 85 ▇▃▃▂▁
region_code 0 1 26.39 13.23 0 15 28 35 52 ▃▂▇▃▃
annual_premium 0 1 30564.39 17213.16 2630 24405 31669 39400 540165 ▇▁▁▁▁
policy_sales_channel 0 1 112.03 54.20 1 29 133 152 163 ▅▁▁▃▇
days_associated 0 1 154.35 83.67 10 82 154 227 299 ▇▇▇▇▇
  • There are no NA values, so we don't need to worry about imputing missing data (the quick check below confirms this).
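Counting NA values per column with base R:

colSums(is.na(df2)) # all zeros means there is nothing to impute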

3.5 Descriptive Statistics

3.5.1 General

df2 %>% 
  select(-id) %>% 
  tbl_summary(digits = list(all_categorical() ~ c(0, 2)))
Characteristic N = 381,109¹
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license 380,297 (99.79%)
region_code 28 (15, 35)
previously_insured 174,628 (45.82%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage 192,413 (50.49%)
annual_premium 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response 46,710 (12.26%)
¹ n (%); Median (IQR)
  • To see the explicit “yes”/“no” categories, we must specify these variables as categorical.
df2 %>% 
  select(-id) %>% 
  tbl_summary(type = list(response ~ "categorical",
                          driving_license ~ "categorical",
                          previously_insured ~ "categorical",
                          vehicle_damage ~ "categorical"),
              digits = list(all_categorical() ~ c(0, 2)))
Characteristic N = 381,109¹
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license
    no 812 (0.21%)
    yes 380,297 (99.79%)
region_code 28 (15, 35)
previously_insured
    no 206,481 (54.18%)
    yes 174,628 (45.82%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage
    no 188,696 (49.51%)
    yes 192,413 (50.49%)
annual_premium 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response
    yes 46,710 (12.26%)
    no 334,399 (87.74%)
¹ n (%); Median (IQR)

3.5.2 More detailed statistics for numeric variables

num_attributes <- df2 %>% 
  select(age, annual_premium, days_associated)
descriptive_table <- descr(num_attributes, style = "rmarkdown")
kable(data.frame(descriptive_table), format = "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
age annual_premium days_associated
Mean 3.882258e+01 3.056439e+04 1.543474e+02
Std.Dev 1.551161e+01 1.721316e+04 8.367130e+01
Min 2.000000e+01 2.630000e+03 1.000000e+01
Q1 2.500000e+01 2.440500e+04 8.200000e+01
Median 3.600000e+01 3.166900e+04 1.540000e+02
Q3 4.900000e+01 3.940000e+04 2.270000e+02
Max 8.500000e+01 5.401650e+05 2.990000e+02
MAD 1.779120e+01 1.112543e+04 1.082298e+02
IQR 2.400000e+01 1.499500e+04 1.450000e+02
CV 3.995512e-01 5.631768e-01 5.420973e-01
Skewness 6.725337e-01 1.766073e+00 3.029500e-03
SE.Skewness 3.967800e-03 3.967800e-03 3.967800e-03
Kurtosis -5.656762e-01 3.400391e+01 -1.200697e+00
N.Valid 3.811090e+05 3.811090e+05 3.811090e+05
Pct.Valid 1.000000e+02 1.000000e+02 1.000000e+02

3.5.3 Visualization

  • Numeric
age_plt <- df2 %>%  
  ggplot(aes(x = age)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 1, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Age') +
  ylab('Customers by Age (density)') +
  ggtitle('Customers by Age')

paid_plt <- df2 %>%  
  ggplot(aes(x = annual_premium)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 5000, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Health Annual Paid') +
  ylab('Customers by Health Annual Premium (density)') +
  ggtitle('Customers by Health Annual Paid')

days_plt <- df2 %>%  
  ggplot(aes(x = days_associated)) +
  geom_histogram(aes(y=after_stat(density)), binwidth = 5, color = "gray", fill = "lightblue", 
                 alpha = 0.5) +
  geom_density(color = "blue") +
  xlab('Days Associated') +
  ylab('Customers by Days Associated (density)') +
  ggtitle('Customers by Days Associated')

grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)  

  • Categorical
# variables we don't want in cat_attributes
var_exc <- num_attributes %>% names()

cat_attributes <- df2 %>% 
  select(-any_of(var_exc), -id)

gender_plt <- cat_attributes %>%  
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender), show.legend = FALSE) 

region_plt <- cat_attributes %>%  
  ggplot(aes(x = factor(region_code))) +
  geom_bar(aes(fill = factor(region_code)), show.legend = FALSE) +
  xlab("region_code")

policy_plt <- cat_attributes %>%  
  ggplot(aes(x = factor(policy_sales_channel))) +
  geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = FALSE) +
  xlab("policy_sales_channel")

license_plt <- cat_attributes %>%  
  ggplot(aes(x = driving_license)) +
  geom_bar(aes(fill = driving_license), show.legend = FALSE) 

vehicle_age_plt <- cat_attributes %>%  
  ggplot(aes(x = vehicle_age)) +
  geom_bar(aes(fill = vehicle_age), show.legend = FALSE) 

damage_plt <- cat_attributes %>%  
  ggplot(aes(x = vehicle_damage)) +
  geom_bar(aes(fill = vehicle_damage), show.legend = FALSE) 


previous_plt <- cat_attributes %>%  
  ggplot(aes(x = previously_insured)) +
  geom_bar(aes(fill = previously_insured), show.legend = FALSE) 

response_plt <- cat_attributes %>%  
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = response), show.legend = FALSE) 

grid.arrange(gender_plt, region_plt, policy_plt, license_plt,
             vehicle_age_plt, damage_plt, previous_plt,
             response_plt, ncol = 2, nrow = 4)    

4 Hypothesis creation

htmltools::img(src = knitr::image_uri("img/mind_map_hypothesis.png"), width = "500", height = "500")

Customer hypotheses:

  • H1) Interest is greater in customers with HIGHER AGE.
  • H2) Interest is greater in FEMALE CUSTOMERS.

Vehicle insurance hypotheses:

  • H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE.

  • H4) Interest is greater in customers who HAD VEHICLE DAMAGE.

  • H5) Interest is greater in customers who HAD THEIR VEHICLE PREVIOUSLY INSURED.

Health insurance hypotheses:

  • H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES.

  • H7) Interest is greater in customers with MORE DAYS ASSOCIATED.

Offer-method hypotheses: (no data available)

Vehicle hypotheses: (no data available)

Seasonality hypotheses: (no data available)

Prioritization of hypotheses: all can be validated in the first CRISP cycle.

5 Exploratory Data Analysis

5.1 Univariate Analysis

Done in the descriptive statistics section above.

5.2 Bivariate Analysis

  • H1) Interest is greater in customers with HIGHER AGE. ✅
df2 %>% 
  ggplot(aes(x = response, y = age)) +
  stat_boxplot(geom ='errorbar', width = 0.6) + 
  geom_boxplot(aes(fill = response), show.legend = FALSE)

# add a statistical test
df2 %>% 
  select(age, response) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
age 43 (35, 51) 34 (24, 49) <0.001
¹ Median (IQR)
² Wilcoxon rank sum test
# let's take a look at the age distribution for each group
df2 %>% 
  ggplot(aes(age)) +
  geom_histogram(binwidth = 1, color = "gray", fill = "navyblue") +
  facet_grid(rows = vars(response)) +
  ggtitle("Age distribution")

It seems that age has a strong impact on the response: the age distribution differs between interested and not interested customers, and the statistical test confirms age is a significant factor for response (p < 0.001). Interested customers are, on average, older than not interested customers. So we can say this hypothesis is TRUE: interested customers are older.

  • H2) Interest is greater in FEMALE CUSTOMERS.
df2 %>% 
  ggplot(aes(x = gender)) +
  geom_bar(aes(fill = gender), show.legend = FALSE)

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = gender), position = 'dodge')

df2 %>% 
  select(gender) %>% 
  tbl_summary()
Characteristic N = 381,109¹
gender
    female 175,020 (46%)
    male 206,089 (54%)
¹ n (%)
df2 %>% 
  select(response, gender) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
gender <0.001
    female 18,185 (39%) 156,835 (47%)
    male 28,525 (61%) 177,564 (53%)
¹ n (%)
² Pearson’s Chi-squared test

61% of interested customers are male. This hypothesis is FALSE, although gender is a statistically significant variable for response (p < 0.001).

H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE.

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = vehicle_age), position = 'dodge')

df2 %>% 
  select(response, vehicle_age) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
vehicle_age <0.001
    below_1_year 7,202 (15%) 157,584 (47%)
    between_1_2_years 34,806 (75%) 165,510 (49%)
    over_2_years 4,702 (10%) 11,305 (3.4%)
¹ n (%)
² Pearson’s Chi-squared test

Most interested customers have vehicles aged between 1 and 2 years (75%), not the newest ones. So this hypothesis is FALSE. Vehicle age is nonetheless a significant variable for response.

H4) Interest is greater in customers who HAD VEHICLE DAMAGE.

df2 %>% 
  ggplot(aes(x = response)) +
  geom_bar(aes(fill = vehicle_damage), 
               position = 'dodge')

df2 %>% 
  select(response, vehicle_damage) %>% 
  tbl_summary(by = response) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
vehicle_damage 45,728 (98%) 146,685 (44%) <0.001
¹ n (%)
² Pearson’s Chi-squared test

98% of interested customers had vehicle damage. So this hypothesis is TRUE.

H5) Interest is greater in customers who HAD THEIR VEHICLE PREVIOUSLY INSURED.

df2 %>% 
  ggplot(aes(x = factor(response))) +
  geom_bar(aes(fill = factor(previously_insured)), 
           position = 'dodge')

df2 %>% 
  select(response, previously_insured) %>% 
  mutate(previously_insured = factor(previously_insured)) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
previously_insured 158 (0.34%) 174,470 (52.17%) <0.001
¹ n (%)
² Pearson’s Chi-squared test

This hypothesis is FALSE. Only 0.34% of customers interested in insurance had their vehicle previously insured.

H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES.

df2 %>% 
  ggplot(aes(x = factor(response), y = annual_premium)) +
  geom_boxplot(aes(color = factor(response)), show.legend = FALSE)

df2 %>% 
  select(response, annual_premium) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
annual_premium 33,002 (24,868, 41,297) 31,504 (24,351, 39,120) <0.001
¹ Median (IQR)
² Wilcoxon rank sum test
# Checking annual_premium for each response group
df2 %>% 
  ggplot(aes(annual_premium)) +
  geom_histogram(binwidth = 10000) +
  facet_grid(rows = vars(response))

FALSE. Customers interested in insurance don’t have HIGHER ANNUAL HEALTH EXPENSES; both groups have roughly the same typical premium. The variable has a huge number of outliers and an enormous range, and with a sample this large even a small difference in medians produces a significant p-value.

H7) Interest is greater in customers with MORE DAYS ASSOCIATED.

df2 %>% 
  ggplot(aes(x = factor(response), y = days_associated)) +
  geom_boxplot(aes(color = factor(response)), show.legend = FALSE)

df2 %>% 
  select(response, days_associated) %>% 
  tbl_summary(by = response,
              digits = list(all_categorical() ~ c(0, 2))) %>% 
  add_p()
Characteristic yes, N = 46,710¹ no, N = 334,399¹ p-value²
days_associated 154 (82, 226) 154 (82, 227) 0.5
¹ Median (IQR)
² Wilcoxon rank sum test
# Checking days_associated for each response group
df2 %>% 
  ggplot(aes(days_associated)) +
  geom_histogram(binwidth = 10, color = "gray", fill = "lightblue") +
  facet_grid(rows = vars(response))

FALSE. There is no difference in the number of days associated between the interested and not interested customer groups.

5.3 Hypothesis conclusion

Hypothesis Conclusion Estimated_Relevance
H1 - Interest is greater in customers with HIGHER AGE. True Medium
H2 - Interest is greater in FEMALE CUSTOMERS. False Low
H3 - Interest is greater in customers WITH MOST RECENT VEHICLE AGE. False High
H4 - Interest is greater in customers who HAD VEHICLE DAMAGE. True High
H5 - Interest is greater in Customers who HAD VEHICLE PREVIOUSLY INSURED. False High
H6 - Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES. False Low
H7 - Interest is greater in customers with MORE DAYS ASSOCIATED. False Low

6 Feature Selection

We will use Random Forest to select variables by importance.

library(randomForest)

predictors <- df2 %>% select(gender:days_associated)
target <- df2$response     

start_time <- Sys.time()
rf_model <- randomForest(predictors, target, ntree = 10, 
                         importance = TRUE)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste("Elapsed time in seconds:", round(elapsed_time, 2)))
## [1] "Elapsed time in seconds: 19.37"
# Access the variable importance scores
importance(rf_model)
##                              yes          no MeanDecreaseAccuracy
## gender                0.06869669   2.5298923            1.9084967
## age                  -2.82413657   5.5316309            6.1802037
## driving_license       0.90537135   0.4203814            0.8396680
## region_code          -4.13522654  12.8723633           15.4708946
## previously_insured   25.25440800 -16.4161763           -7.9808798
## vehicle_age          -6.84508567   8.5334384            9.5968620
## vehicle_damage       25.00425477 -17.5098838          -10.4359392
## annual_premium       -5.75777154   8.0191668            8.2737357
## policy_sales_channel  3.70009752   4.0266010            9.0407372
## days_associated      -0.43306515  -0.4084573           -0.5650863
##                      MeanDecreaseGini
## gender                      733.83820
## age                        6160.04285
## driving_license              38.67545
## region_code                3695.24498
## previously_insured         4702.67217
## vehicle_age                1567.51227
## vehicle_damage             5902.10013
## annual_premium             7438.30521
## policy_sales_channel       2864.53179
## days_associated            7771.61777
# Plot variable importance
varImpPlot(rf_model)

  1. MeanDecreaseAccuracy: This column represents the mean decrease in accuracy caused by randomly permuting the values of the variable across the out-of-bag (OOB) samples. In other words, it measures the decrease in model accuracy when the values of a particular variable are randomly shuffled. Higher values indicate more important variables.

  2. MeanDecreaseGini: This column represents the mean decrease in the Gini impurity caused by randomly permuting the values of the variable across the OOB samples. The Gini impurity measures the node impurity in a decision tree, and higher values indicate more important variables.
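To make the permutation idea behind MeanDecreaseAccuracy concrete, here is a minimal sketch, assuming the rf_model, predictors and target objects fitted above. Note that randomForest computes the real measure per tree on the out-of-bag samples, so its numbers differ from this in-sample illustration.

# Sketch: permutation importance for a single variable.
# Shuffle one predictor and measure how much accuracy drops.
set.seed(1512)
baseline_acc <- mean(predict(rf_model, predictors) == target)

permuted <- predictors
permuted$vehicle_damage <- sample(permuted$vehicle_damage)
permuted_acc <- mean(predict(rf_model, permuted) == target)

baseline_acc - permuted_acc # the larger the drop, the more important the variable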

In this first CRISP cycle, we will select the top 7 variables from the MeanDecreaseGini plot.

df_selected_feat <- df2 %>% 
  select(days_associated, annual_premium, previously_insured, age,
         vehicle_damage, region_code, policy_sales_channel, response)

7 Data Preparation

7.1 Splitting into training and test datasets

set.seed(1512)

health_split <- df_selected_feat %>% 
  initial_split(prop = 0.75,
                strata = response)

df_train <- health_split %>% 
  training()

df_test <- health_split %>% 
  testing()

7.1.1 Checking train and test sets

# Check number of rows in each dataset
print(paste0("#Rows Train: ", nrow(df_train)))
## [1] "#Rows Train: 285831"
print(paste0("#Rows Test: ", nrow(df_test)))
## [1] "#Rows Test: 95278"
# Distribution of response in training data
df_train %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
Characteristic N = 285,831¹
response
    yes 35,032 (12%)
    no 250,799 (88%)
¹ n (%)
# Distribution of response in test data
df_test %>% 
  select(response) %>% 
  tbl_summary(type = list(response ~ "categorical"))
Characteristic N = 95,278¹
response
    yes 11,678 (12%)
    no 83,600 (88%)
¹ n (%)

We have the same proportions of “yes” and “no” in the training and test sets, as expected from the stratified split (strata = response).

7.2 Preprocessing

We will use a tidymodels recipe to set up the preprocessing of the variables. Once this recipe is defined, we can apply it to the training and test data.

  • Standardization: for numeric variables with a normal distribution.

  • Rescaling: for numeric variables with a non-normal distribution.

  • Encoding: transform categorical variables into numeric ones.

In this first cycle, we will apply the following transformations (a sketch of an alternative recipe follows this list):

  • Remove highly correlated variables.
  • Normalize all numeric variables.
  • One-hot encode the categorical variables.
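For reference, here is a sketch of how the correlation filter and min-max rescaling could be written with the same recipes API; the 0.8 threshold and [0, 1] range are illustrative choices, not the settings used in this cycle.

# Illustrative alternative recipe (not the one trained below)
alt_recipe <- recipe(response ~ ., data = df_train) %>%
  step_corr(all_numeric(), threshold = 0.8) %>%    # drop highly correlated numerics
  step_range(all_numeric(), min = 0, max = 1) %>%  # min-max rescaling to [0, 1]
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) # strict one-hot encoding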

7.2.1 Checking for highly correlated numeric variables

df_train %>% 
  select_if(is.numeric) %>% 
  cor()
##                      days_associated annual_premium          age  region_code
## days_associated         1.0000000000   0.0001439562 -0.001303359 -0.002435694
## annual_premium          0.0001439562   1.0000000000  0.068682407 -0.010880993
## age                    -0.0013033590   0.0686824066  1.000000000  0.042007616
## region_code            -0.0024356939  -0.0108809934  0.042007616  1.000000000
## policy_sales_channel    0.0001014691  -0.1128024119 -0.578551159 -0.041932309
##                      policy_sales_channel
## days_associated              0.0001014691
## annual_premium              -0.1128024119
## age                         -0.5785511595
## region_code                 -0.0419323087
## policy_sales_channel         1.0000000000
  • There are none above a typical cutoff (e.g., |r| > 0.8); the strongest correlation is -0.58, between age and policy_sales_channel.

7.2.2 Creating the preprocessing steps

df_recipe <- recipe(response ~ .,
                    data = df_train) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())
df_recipe

# Train the recipe
df_train_prep <- df_recipe %>%
  prep(training = df_train)
df_train_prep

7.2.3 Apply recipe to training data

df_train_processed <- df_train_prep %>%
  bake(new_data = df_train)
df_train_processed %>% head()

7.2.4 Apply recipe to test data

df_test_processed <- df_train_prep %>%
  bake(new_data = df_test)
df_test_processed %>% head()

8 Models to be tested

  • Logistic Regression ✅

  • KNN ✅

  • Decision Tree ✅

  • Random Forest ✅

  • XGBoost ✅

8.1 Metrics

  • Accuracy, precision, recall and F1-score

In this first pass we simply fit each model and compute a single performance measurement on the test set; afterwards, we will consider cross-validation and hyperparameter tuning. A sketch of how these metrics are defined follows.
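As a reference for what these metrics mean, here is a minimal sketch of how they are computed from confusion-matrix counts, with “yes” as the positive class; metrics_from_counts is a hypothetical helper, not a yardstick function.

# Hypothetical helper: the four metrics from raw confusion-matrix counts
metrics_from_counts <- function(tp, fp, fn, tn) {
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)
  tibble(
    accuracy  = (tp + tn) / (tp + fp + fn + tn),
    precision = precision,
    recall    = recall,
    f_meas    = 2 * precision * recall / (precision + recall)
  )
}
metrics_from_counts(tp = 100, fp = 50, fn = 30, tn = 820) # made-up example counts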

8.2 Logistic Regression Model

8.2.1 Model specification

logistic_model <- logistic_reg() %>% 
  set_engine('glm') %>% 
  set_mode('classification')

8.2.2 Model fitting

start_time <- Sys.time()

logistic_fit <- logistic_model %>% 
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a logsitc model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a logsitc model: 1.84 seconds"

8.2.3 Predict outcome categories

class_preds <- logistic_fit %>% 
  predict(new_data = df_test,
          type = 'class')
class_preds

8.2.4 Estimated probabilities

prob_preds <- logistic_fit %>% 
  predict(new_data = df_test,
          type = 'prob')
prob_preds

8.2.5 Combining results

insurance_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

insurance_results 

8.2.6 Confusion matrix

conf_mat(insurance_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
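At the default 0.5 probability threshold the logistic model never predicts “yes”: with only about 12% positive cases, apparently no customer reaches an estimated probability above 0.5, so every prediction lands in the “no” row.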

8.2.7 Creating a metric set

custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
lr_metrics <- custom_metrics(insurance_results,
         truth = response,
         estimate = .pred_class)
lr_metrics
# ROC Curve
insurance_results %>% 
  roc_curve(truth = response, .pred_yes) %>% 
  autoplot()

# Calculating AUC
roc_auc(insurance_results,
        truth = response,
        .pred_yes)
# Saving metrics results
lr_metrics <-  pivot_wider(lr_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'logistic regression') %>% 
  select(model, accuracy:f_meas)
lr_metrics

8.3 KNN Model

More info: https://parsnip.tidymodels.org/reference/nearest_neighbor.html

8.3.1 Model specification

knn_model <- nearest_neighbor(weight_func = "rectangular", 
                              neighbors = 3) %>% 
  set_mode("classification") %>% 
  set_engine("kknn")

8.3.2 Model fitting

start_time <- Sys.time()

library(kknn)
knn_fit <- knn_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a knn model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a knn model: 10.21 seconds"
# Predictions 
class_preds <- knn_fit %>% 
  predict(new_data = df_test,
          type = 'class')
# Estimated Probabilities
prob_preds <- knn_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
knn_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(knn_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes  2468  5507
##        no   9210 78093
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
knn_metrics <- custom_metrics(knn_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
knn_metrics <-  pivot_wider(knn_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'KNN') %>% 
  select(model, accuracy:f_meas)
knn_metrics

8.4 Decision Tree

# Model Specification
dt_model <- decision_tree(tree_depth = 10) %>%
  set_engine("rpart") %>%
  set_mode("classification")
# Model fitting
start_time <- Sys.time()

dt_fit <- dt_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a decision tree model: ", round(elapsed_time, 2)))
## [1] "Elapsed time to train a decision tree model: 0.69"
dt_fit
## parsnip model object
## 
## n= 285831 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 285831 35032 no (0.1225619 0.8774381) *
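The fitted tree consists of a single terminal root node: no split passed rpart’s default complexity criterion, so the model predicts the majority class “no” for every customer, which explains the confusion matrix further below.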
library(rpart.plot)
dt_fit %>%
  extract_fit_engine() %>%
  rpart.plot()

# Predictions 
class_preds <- dt_fit %>% 
  predict(new_data = df_test,
          type = 'class')
# Estimated Probabilities
prob_preds <- dt_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
dt_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(dt_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
dt_metrics <- custom_metrics(dt_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
dt_metrics <-  pivot_wider(dt_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'decision tree') %>% 
  select(model, accuracy:f_meas)
dt_metrics

8.5 Random Forest

rf_model <- rand_forest(
  mtry = 3,
  trees = 1000,
  min_n = 100
) %>%
  set_mode("classification") %>%
  set_engine("ranger")
start_time <- Sys.time()

library(ranger)
rf_fit <- rf_model %>%
  fit(response ~ .,
      data = df_train)

end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a random forest model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a random forest model: 7.39 seconds"
# Predictions 
class_preds <- rf_fit %>% 
  predict(new_data = df_test,
          type = 'class')

# Estimated Probabilities
prob_preds <- rf_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
rf_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

# Confusion Matrix
conf_mat(rf_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes   148   159
##        no  11530 83441
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
rf_metrics <- custom_metrics(rf_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
rf_metrics <-  pivot_wider(rf_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'random forest') %>% 
  select(model, accuracy:f_meas)
rf_metrics

8.6 XGBoost

# Model
xgb_model <- boost_tree(
  mtry = 1,
  trees = 5,
  min_n = 1000,
  tree_depth = 4,
  loss_reduction = 0,
  sample_size = 1
  ) %>%
  set_mode("classification") %>%
  set_engine("xgboost",
             cores = 1)
start_time <- Sys.time()

library(xgboost)
xgb_fit <- xgb_model %>%
  fit(response ~ .,
      data = df_train)
## [10:31:45] WARNING: src/learner.cc:767: 
## Parameters: { "cores" } are not used.
end_time <- Sys.time()

# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a xgboost model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a xgboost model: 0.63 seconds"
# Predictions 
class_preds <- xgb_fit %>% 
  predict(new_data = df_test,
          type = 'class')

# Estimated Probabilities
prob_preds <- xgb_fit %>% 
  predict(new_data = df_test,
          type = 'prob')

# Combine results 
xgb_results <- df_test %>% 
  select(response) %>% 
  bind_cols(class_preds, prob_preds)

# Confusion Matrix
conf_mat(xgb_results,
         truth = response,
         estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
xgb_metrics <- custom_metrics(xgb_results,
         truth = response,
         estimate = .pred_class)

# Saving metrics results
xgb_metrics <-  pivot_wider(xgb_metrics, 
            names_from = .metric, 
            values_from = .estimate) %>% 
  mutate(model = 'xgb') %>% 
  select(model, accuracy:f_meas)

8.7 Model comparison

model_comparison_df <- rbind(lr_metrics,
                             knn_metrics,
                             dt_metrics,
                             rf_metrics,
                             xgb_metrics) %>% 
  arrange(desc(precision))

model_comparison_df
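Note that the models that never predict “yes” at the default threshold (logistic regression, decision tree and XGBoost) have no predicted positives, so yardstick returns NA for their precision and F1; the ranking above therefore effectively compares only KNN and random forest. The cross-validation and hyperparameter tuning planned earlier are the natural next steps for handling this class imbalance.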