library(tidyverse)    # data wrangling and ggplot2
library(readr)        # fast CSV reading
library(skimr)        # quick dataset summaries
library(janitor)      # cleaning variable names
library(moments)      # skewness and kurtosis
library(summarytools) # descriptive statistics (descr)
library(gridExtra)    # arranging multiple ggplots
library(gtsummary)    # summary tables
library(tidymodels)   # modeling framework
The data can be found on Kaggle.
df <- read_csv("datasets/archive/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
Make all variable names snake_case. Rename vintage to days_associated.
df1 <- janitor::clean_names(df) %>%
rename(days_associated = vintage)
names(df1)
## [1] "id" "gender" "age"
## [4] "driving_license" "region_code" "previously_insured"
## [7] "vehicle_age" "vehicle_damage" "annual_premium"
## [10] "policy_sales_channel" "days_associated" "response"
str(df1)
## spc_tbl_ [381,109 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:381109] 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr [1:381109] "Male" "Male" "Male" "Male" ...
## $ age : num [1:381109] 44 76 47 21 29 24 23 56 24 32 ...
## $ driving_license : num [1:381109] 1 1 1 1 1 1 1 1 1 1 ...
## $ region_code : num [1:381109] 28 3 28 11 41 33 11 28 3 6 ...
## $ previously_insured : num [1:381109] 0 0 0 1 1 0 0 0 1 1 ...
## $ vehicle_age : chr [1:381109] "> 2 Years" "1-2 Year" "> 2 Years" "< 1 Year" ...
## $ vehicle_damage : chr [1:381109] "Yes" "No" "Yes" "No" ...
## $ annual_premium : num [1:381109] 40454 33536 38294 28619 27496 ...
## $ policy_sales_channel: num [1:381109] 26 26 26 152 152 160 152 26 152 152 ...
## $ days_associated : num [1:381109] 217 183 27 203 39 176 249 72 28 80 ...
## $ response : num [1:381109] 1 0 1 0 0 0 0 1 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. Gender = col_character(),
## .. Age = col_double(),
## .. Driving_License = col_double(),
## .. Region_Code = col_double(),
## .. Previously_Insured = col_double(),
## .. Vehicle_Age = col_character(),
## .. Vehicle_Damage = col_character(),
## .. Annual_Premium = col_double(),
## .. Policy_Sales_Channel = col_double(),
## .. Vintage = col_double(),
## .. Response = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Or, more neatly organized in a tibble:
class_vec <- unlist(lapply(df1, class))
names_vc <- names(df1)
data_types <- tibble(variable = names_vc, type = class_vec)
data_types
Necessary transformations:
All character variables must be converted to factor.
Variables of class numeric can stay as they are, whether float or integer.
Variables coded as 0 and 1 will be converted to no and yes, respectively.
Categorical variables will have their categories lowercased for standardization.
The response variable must have yes as its first factor level.
Rename the categories of vehicle_age.
df2 <- df1 %>%
mutate(
across(where(is.character), tolower),
driving_license = ifelse(driving_license == 1, "yes", "no"),
previously_insured = ifelse(previously_insured == 1, "yes", "no"),
response = ifelse(response == 1, "yes", "no"),
vehicle_age = case_when(
vehicle_age == '> 2 years' ~ 'over_2_years',
vehicle_age == '1-2 year' ~ 'between_1_2_years',
vehicle_age == '< 1 year' ~ 'below_1_year'
)
) %>%
mutate_if(is.character, as.factor)
# Check the levels of the response variable
df2$response %>%
levels()
## [1] "no" "yes"
# Correct order
df2 <- df2 %>%
mutate(response = factor(response, levels = c("yes", "no")))
Column | Meaning |
---|---|
id | unique customer identifier. |
gender | client gender: Male / Female. |
age | customer age. |
region_code | customer region code. |
policy_sales_channel | anonymized code for the channel used to reach the customer, e.g., over mail, over phone, in person. |
driving_license | 0 = customer does not have a driving license, 1 = customer already has a driving license. |
vehicle_age | vehicle age: < 1 Year, 1-2 Year, > 2 Years |
vehicle_damage | No = customer has never had their vehicle damaged in the past, Yes = has had it. |
previously_insured | 0 = customer does not have vehicle insurance, 1 = already has vehicle insurance. |
annual_premium | annual amount paid by the customer to the company for health insurance. Currency: Rs (Pakistani rupee, R$1.00 = ± Rs0.03). |
days_associated | number of days since the customer joined the company by purchasing health insurance. The policy is annual. |
response | 0 = customer is not interested, 1 = customer is interested. |
skim(df2)
Name | df2 |
Number of rows | 381109 |
Number of columns | 12 |
_______________________ | |
Column type frequency: | |
factor | 6 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
gender | 0 | 1 | FALSE | 2 | mal: 206089, fem: 175020 |
driving_license | 0 | 1 | FALSE | 2 | yes: 380297, no: 812 |
previously_insured | 0 | 1 | FALSE | 2 | no: 206481, yes: 174628 |
vehicle_age | 0 | 1 | FALSE | 3 | bet: 200316, bel: 164786, ove: 16007 |
vehicle_damage | 0 | 1 | FALSE | 2 | yes: 192413, no: 188696 |
response | 0 | 1 | FALSE | 2 | no: 334399, yes: 46710 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
id | 0 | 1 | 190555.00 | 110016.84 | 1 | 95278 | 190555 | 285832 | 381109 | ▇▇▇▇▇ |
age | 0 | 1 | 38.82 | 15.51 | 20 | 25 | 36 | 49 | 85 | ▇▃▃▂▁ |
region_code | 0 | 1 | 26.39 | 13.23 | 0 | 15 | 28 | 35 | 52 | ▃▂▇▃▃ |
annual_premium | 0 | 1 | 30564.39 | 17213.16 | 2630 | 24405 | 31669 | 39400 | 540165 | ▇▁▁▁▁ |
policy_sales_channel | 0 | 1 | 112.03 | 54.20 | 1 | 29 | 133 | 152 | 163 | ▅▁▁▃▇ |
days_associated | 0 | 1 | 154.35 | 83.67 | 10 | 82 | 154 | 227 | 299 | ▇▇▇▇▇ |
df2 %>%
select(-id) %>%
tbl_summary(digits = list(all_categorical() ~ c(0, 2)))
Characteristic | N = 381,1091 |
---|---|
gender | |
female | 175,020 (45.92%) |
male | 206,089 (54.08%) |
age | 36 (25, 49) |
driving_license | 380,297 (99.79%) |
region_code | 28 (15, 35) |
previously_insured | 174,628 (45.82%) |
vehicle_age | |
below_1_year | 164,786 (43.24%) |
between_1_2_years | 200,316 (52.56%) |
over_2_years | 16,007 (4.20%) |
vehicle_damage | 192,413 (50.49%) |
annual_premium | 31,669 (24,405, 39,400) |
policy_sales_channel | 133 (29, 152) |
days_associated | 154 (82, 227) |
response | 46,710 (12.26%) |
1 n (%); Median (IQR) |
df2 %>%
select(-id) %>%
tbl_summary(type = list(response ~ "categorical",
driving_license ~ "categorical",
previously_insured ~ "categorical",
vehicle_damage ~ "categorical"),
digits = list(all_categorical() ~ c(0, 2)))
Characteristic | N = 381,1091 |
---|---|
gender | |
female | 175,020 (45.92%) |
male | 206,089 (54.08%) |
age | 36 (25, 49) |
driving_license | |
no | 812 (0.21%) |
yes | 380,297 (99.79%) |
region_code | 28 (15, 35) |
previously_insured | |
no | 206,481 (54.18%) |
yes | 174,628 (45.82%) |
vehicle_age | |
below_1_year | 164,786 (43.24%) |
between_1_2_years | 200,316 (52.56%) |
over_2_years | 16,007 (4.20%) |
vehicle_damage | |
no | 188,696 (49.51%) |
yes | 192,413 (50.49%) |
annual_premium | 31,669 (24,405, 39,400) |
policy_sales_channel | 133 (29, 152) |
days_associated | 154 (82, 227) |
response | |
yes | 46,710 (12.26%) |
no | 334,399 (87.74%) |
1 n (%); Median (IQR) |
num_attributes <- df2 %>%
select(age, annual_premium, days_associated)
descriptive_table <- descr(num_attributes, style = "rmarkdown")
# kable() comes from knitr and kable_styling() from kableExtra
library(knitr)
library(kableExtra)
kable(data.frame(descriptive_table), format = "html") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
age | annual_premium | days_associated | |
---|---|---|---|
Mean | 38.82 | 30,564.39 | 154.35 |
Std.Dev | 15.51 | 17,213.16 | 83.67 |
Min | 20 | 2,630 | 10 |
Q1 | 25 | 24,405 | 82 |
Median | 36 | 31,669 | 154 |
Q3 | 49 | 39,400 | 227 |
Max | 85 | 540,165 | 299 |
MAD | 17.79 | 11,125.43 | 108.23 |
IQR | 24 | 14,995 | 145 |
CV | 0.40 | 0.56 | 0.54 |
Skewness | 0.67 | 1.77 | 0.003 |
SE.Skewness | 0.004 | 0.004 | 0.004 |
Kurtosis | -0.57 | 34.00 | -1.20 |
N.Valid | 381,109 | 381,109 | 381,109 |
Pct.Valid | 100 | 100 | 100 |
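Since the moments package is already loaded, a few of these statistics can be cross-checked directly; a minimal sketch (note that moments::kurtosis() returns raw Pearson kurtosis, which is 3 for a normal distribution, while the Kurtosis row above is on the excess scale, 0 for a normal):
# Cross-check mean, sd, skewness and kurtosis with dplyr + moments
num_attributes %>%
summarise(across(
everything(),
list(mean = mean, sd = sd,
skew = moments::skewness,
kurt = moments::kurtosis)
))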
age_plt <- df2 %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y=after_stat(density)), binwidth = 1, color = "gray", fill = "lightblue",
alpha = 0.5) +
geom_density(color = "blue") +
xlab('Age') +
ylab('Customers by Age (density)') +
ggtitle('Customers by Age')
paid_plt <- df2 %>%
ggplot(aes(x = annual_premium)) +
geom_histogram(aes(y=after_stat(density)), binwidth = 5000, color = "gray", fill = "lightblue",
alpha = 0.5) +
geom_density(color = "blue") +
xlab('Health Annual Paid') +
ylab('Customers by Health Annual Premium (density)') +
ggtitle('Customers by Health Annual Paid')
days_plt <- df2 %>%
ggplot(aes(x = days_associated)) +
geom_histogram(aes(y=after_stat(density)), binwidth = 5, color = "gray", fill = "lightblue",
alpha = 0.5) +
geom_density(color = "blue") +
xlab('Days Associated') +
ylab('Customers by Days Associated (density)') +
ggtitle('Customers by Days Associated')
grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)
# variables we don't want to keep in cat_attributes
var_exc <- num_attributes %>% names()
cat_attributes <- df2 %>%
select(-all_of(var_exc), -id)
gender_plt <- cat_attributes %>%
ggplot(aes(x = gender)) +
geom_bar(aes(fill = gender), show.legend = FALSE)
region_plt <- cat_attributes %>%
ggplot(aes(x = factor(region_code))) +
geom_bar(aes(fill = factor(region_code)), show.legend = FALSE) +
xlab("region_code")
policy_plt <- cat_attributes %>%
ggplot(aes(x = factor(policy_sales_channel))) +
geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = FALSE) +
xlab("policy_sales_channel")
license_plt <- cat_attributes %>%
ggplot(aes(x = driving_license)) +
geom_bar(aes(fill = driving_license), show.legend = FALSE)
vehicle_age_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_age)) +
geom_bar(aes(fill = vehicle_age), show.legend = FALSE)
damage_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_damage)) +
geom_bar(aes(fill = vehicle_damage), show.legend = FALSE)
previous_plt <- cat_attributes %>%
ggplot(aes(x = previously_insured)) +
geom_bar(aes(fill = previously_insured), show.legend = FALSE)
response_plt <- cat_attributes %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = response), show.legend = FALSE)
grid.arrange(gender_plt, region_plt, policy_plt, license_plt,
vehicle_age_plt, damage_plt, previous_plt,
response_plt, ncol = 2, nrow = 4)
htmltools::img(src = knitr::image_uri("img/mind_map_hypothesis.png"), width = "500", height = "500")
Hypotheses Customers:
Hypotheses Vehicle Insurance:
H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE.
H4) Interest is greater in customers who HAD VEHICLE DAMAGE.
H5) Interest is greater in Customers who HAD VEHICLE PREVIOUSLY INSURED.
Hypotheses Health Insurance:
H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES.
H7) Interest is greater in customers with MORE DAYS ASSOCIATED.
Hypotheses Offer Methods: (no data available)
Hypotheses Vehicle: (no data available)
Hypotheses Seasonality: (no data available)
Prioritization of hypotheses: all can be validated in the first CRISP-DM cycle.
Done before.
df2 %>%
ggplot(aes(x = response, y = age)) +
stat_boxplot(geom ='errorbar', width = 0.6) +
geom_boxplot(aes(fill = response), show.legend = FALSE)
# add a statistical test
df2 %>%
select(age, response) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
age | 43 (35, 51) | 34 (24, 49) | <0.001 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
# let's take a look at the age distribution for each group
df2 %>%
ggplot(aes(age)) +
geom_histogram(binwidth = 1, color = "gray", fill = "navyblue") +
facet_grid(rows = vars(response)) +
ggtitle("Age distribution")
It seems that age has a strong impact on the response: the age distribution differs between interested and not interested customers, and the statistical test confirms that age is a significant factor for the response (p < 0.001). Interested customers are, on average, older than not interested customers. So this hypothesis is TRUE: interested customers are older.
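For reference, the Wilcoxon rank sum test that add_p() reports here can also be run directly in base R; a minimal sketch:
# Wilcoxon rank sum test: does age differ between response groups?
wilcox.test(age ~ response, data = df2)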
df2 %>%
ggplot(aes(x = gender)) +
geom_bar(aes(fill = gender), show.legend = FALSE)
df2 %>%
ggplot(aes(x = factor(response))) +
geom_bar(aes(fill = gender), position = 'dodge')
df2 %>%
select(gender) %>%
tbl_summary()
Characteristic | N = 381,1091 |
---|---|
gender | |
female | 175,020 (46%) |
male | 206,089 (54%) |
1 n (%) |
df2 %>%
select(response, gender) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
gender | <0.001 | ||
female | 18,185 (39%) | 156,835 (47%) | |
male | 28,525 (61%) | 177,564 (53%) | |
1 n (%) | |||
2 Pearson’s Chi-squared test |
61% of interested customers are male. This hypothesis is FALSE, although gender is a statistically significant variable for the response (p < 0.001).
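The same check can be reproduced outside gtsummary; a minimal sketch of the underlying chi-squared test:
# Pearson's chi-squared test of independence between gender and response
chisq.test(table(df2$gender, df2$response))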
H3) Interest is greater in customers WITH MOST RECENT VEHICLE AGE. ❌
df2 %>%
ggplot(aes(x = factor(response))) +
geom_bar(aes(fill = vehicle_age), position = 'dodge')
df2 %>%
select(response, vehicle_age) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
vehicle_age | <0.001 | ||
below_1_year | 7,202 (15%) | 157,584 (47%) | |
between_1_2_years | 34,806 (75%) | 165,510 (49%) | |
over_2_years | 4,702 (10%) | 11,305 (3.4%) | |
1 n (%) | |||
2 Pearson’s Chi-squared test |
Most interested customers have a vehicle aged between 1 and 2 years (75%), not the most recent vehicles. So this hypothesis is FALSE, although vehicle age is a significant variable for the response (p < 0.001).
H4) Interest is greater in customers who HAD VEHICLE DAMAGE. ✅
df2 %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = vehicle_damage),
position = 'dodge')
df2 %>%
select(response, vehicle_damage) %>%
tbl_summary(by = response) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
vehicle_damage | 45,728 (98%) | 146,685 (44%) | <0.001 |
1 n (%) | |||
2 Pearson’s Chi-squared test |
98% of interested customers had vehicle damage. So, this hypothesis is TRUE.
H5) Interest is greater in Customers who HAD VEHICLE PREVIOUSLY INSURED. ❌
df2 %>%
ggplot(aes(x = factor(response))) +
geom_bar(aes(fill = factor(previously_insured)),
position = 'dodge')
df2 %>%
select(response, previously_insured) %>%
mutate(previously_insured = factor(previously_insured)) %>%
tbl_summary(by = response,
digits = list(all_categorical() ~ c(0, 2))) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
previously_insured | 158 (0.34%) | 174,470 (52.17%) | <0.001 |
1 n (%) | |||
2 Pearson’s Chi-squared test |
This hypothesis is FALSE. Only 0.34% of customers interested in the insurance had their vehicle previously insured.
H6) Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES. ❌
df2 %>%
ggplot(aes(x = factor(response), y = annual_premium)) +
geom_boxplot(aes(color = factor(response)), show.legend = FALSE)
df2 %>%
select(response, annual_premium) %>%
tbl_summary(by = response,
digits = list(all_categorical() ~ c(0, 2))) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
annual_premium | 33,002 (24,868, 41,297) | 31,504 (24,351, 39,120) | <0.001 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
# Checking annual_premium for each response group
df2 %>%
ggplot(aes(annual_premium)) +
geom_histogram(binwidth = 10000) +
facet_grid(rows = vars(response))
FALSE. Customers interested in insurance don't have HIGHER ANNUAL HEALTH EXPENSES; both groups have roughly the same median expenses. This variable has a huge number of outliers and an enormous range, and with such a large sample even a small difference produces a significant p-value.
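A quick group summary makes the small practical difference explicit; a minimal sketch using the columns built above:
# Compare location and spread of annual_premium by response group
df2 %>%
group_by(response) %>%
summarise(
median_premium = median(annual_premium),
iqr_premium = IQR(annual_premium),
max_premium = max(annual_premium)
)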
H7) Interest is greater in customers with MORE DAYS ASSOCIATED. ❌
df2 %>%
ggplot(aes(x = factor(response), y = days_associated)) +
geom_boxplot(aes(color = factor(response)), show.legend = FALSE)
df2 %>%
select(response, days_associated) %>%
tbl_summary(by = response,
digits = list(all_categorical() ~ c(0, 2))) %>%
add_p()
Characteristic | yes, N = 46,7101 | no, N = 334,3991 | p-value2 |
---|---|---|---|
days_associated | 154 (82, 226) | 154 (82, 227) | 0.5 |
1 Median (IQR) | |||
2 Wilcoxon rank sum test |
# Checking days_associated for each response group
df2 %>%
ggplot(aes(days_associated)) +
geom_histogram(binwidth = 10, color = "gray", fill = "lightblue") +
facet_grid(rows = vars(response))
FALSE. There is no difference in the number of days associated between the interested and not interested customer groups.
Hypothesis | Conclusion | Estimated_Relevance |
---|---|---|
H1 - Interest is greater in customers with HIGHER AGE. | True | Medium |
H2 - Interest is greater in FEMALE CUSTOMERS. | False | Low |
H3 - Interest is greater in customers WITH MOST RECENT VEHICLE AGE. | False | High |
H4 - Interest is greater in customers who HAD VEHICLE DAMAGE. | True | High |
H5 - Interest is greater in Customers who HAD VEHICLE PREVIOUSLY INSURED. | False | High |
H6 - Interest is greater in customers with HIGHER ANNUAL HEALTH EXPENSES. | False | Low |
H7 - Interest is greater in customers with MORE DAYS ASSOCIATED. | False | Low |
We will use a Random Forest to select variables by importance.
library(randomForest)
predictors <- df2 %>% select(gender:days_associated)
target <- df2$response
start_time <- Sys.time()
rf_model <- randomForest(predictors, target, ntree = 10,
importance = TRUE)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste("Elapsed time in seconds:", round(elapsed_time, 2)))
## [1] "Elapsed time in seconds: 19.37"
# Access the variable importance scores
importance(rf_model)
## yes no MeanDecreaseAccuracy
## gender 0.06869669 2.5298923 1.9084967
## age -2.82413657 5.5316309 6.1802037
## driving_license 0.90537135 0.4203814 0.8396680
## region_code -4.13522654 12.8723633 15.4708946
## previously_insured 25.25440800 -16.4161763 -7.9808798
## vehicle_age -6.84508567 8.5334384 9.5968620
## vehicle_damage 25.00425477 -17.5098838 -10.4359392
## annual_premium -5.75777154 8.0191668 8.2737357
## policy_sales_channel 3.70009752 4.0266010 9.0407372
## days_associated -0.43306515 -0.4084573 -0.5650863
## MeanDecreaseGini
## gender 733.83820
## age 6160.04285
## driving_license 38.67545
## region_code 3695.24498
## previously_insured 4702.67217
## vehicle_age 1567.51227
## vehicle_damage 5902.10013
## annual_premium 7438.30521
## policy_sales_channel 2864.53179
## days_associated 7771.61777
# Plot variable importance
varImpPlot(rf_model)
MeanDecreaseAccuracy: This column represents the mean decrease in accuracy caused by randomly permuting the values of the variable across the out-of-bag (OOB) samples. In other words, it measures the decrease in model accuracy when the values of a particular variable are randomly shuffled. Higher values indicate more important variables.
MeanDecreaseGini: This column represents the total decrease in Gini impurity produced by splits on the variable, summed over all trees and averaged. The Gini index measures node impurity in a decision tree, and higher values indicate more important variables.
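To rank variables programmatically rather than reading them off the plot, the importance matrix can be converted into a sorted tibble; a minimal sketch:
# Sort variables by MeanDecreaseGini, highest first
importance(rf_model) %>%
as.data.frame() %>%
rownames_to_column("variable") %>%
arrange(desc(MeanDecreaseGini)) %>%
as_tibble()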
In this first CRISP-DM cycle, we will select the top 7 variables from the MeanDecreaseGini plot.
df_selected_feat <- df2 %>%
select(days_associated, annual_premium, previously_insured, age, vehicle_damage, region_code, policy_sales_channel, response)
set.seed(1512)
health_split <- df_selected_feat %>%
initial_split(prop = 0.75,
strata = response)
df_train <- health_split %>%
training()
df_test <- health_split %>%
testing()
# Check number of rows in each dataset
print(paste0("#Rows Train: ", nrow(df_train)))
## [1] "#Rows Train: 285831"
print(paste0("#Rows Test: ", nrow(df_test)))
## [1] "#Rows Test: 95278"
# Distribution of response in training data
df_train %>%
select(response) %>%
tbl_summary(type = list(response ~ "categorical"))
Characteristic | N = 285,8311 |
---|---|
response | |
yes | 35,032 (12%) |
no | 250,799 (88%) |
1 n (%) |
# Distribution of response in test data
df_test %>%
select(response) %>%
tbl_summary(type = list(response ~ "categorical"))
Characteristic | N = 95,2781 |
---|---|
response | |
yes | 11,678 (12%) |
no | 83,600 (88%) |
1 n (%) |
We have the same proportions of "yes" and "no" in the training and test sets.
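This can also be checked programmatically; a minimal sketch:
# Response proportions in each split should match thanks to strata = response
bind_rows(
df_train %>% count(response) %>% mutate(set = "train"),
df_test %>% count(response) %>% mutate(set = "test")
) %>%
group_by(set) %>%
mutate(prop = n / sum(n))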
We will use a tidymodels recipe to set up the variable preprocessing. Once this recipe is defined, we can apply it to both the training and test data.
Standardization: for numeric variables with a normal distribution.
Rescaling: for numeric variables with a non-normal distribution.
Encoding: transform categorical variables into numeric variables.
In this first cycle, we will apply the following transformations:
df_train %>%
select_if(is.numeric) %>%
cor()
## days_associated annual_premium age region_code
## days_associated 1.0000000000 0.0001439562 -0.001303359 -0.002435694
## annual_premium 0.0001439562 1.0000000000 0.068682407 -0.010880993
## age -0.0013033590 0.0686824066 1.000000000 0.042007616
## region_code -0.0024356939 -0.0108809934 0.042007616 1.000000000
## policy_sales_channel 0.0001014691 -0.1128024119 -0.578551159 -0.041932309
## policy_sales_channel
## days_associated 0.0001014691
## annual_premium -0.1128024119
## age -0.5785511595
## region_code -0.0419323087
## policy_sales_channel 1.0000000000
df_recipe <- recipe(response ~ .,
data = df_train) %>%
step_normalize(all_numeric()) %>%             # center and scale numeric predictors
step_dummy(all_nominal(), -all_outcomes())    # dummy-encode the factor predictors
df_recipe
# Train the recipe
df_train_prep <- df_recipe %>%
prep(training = df_train)
df_train_prep
df_train_processed <- df_train_prep %>%
bake(new_data = df_train)
df_train_processed %>% head()
df_test_processed <- df_train_prep %>%
bake(new_data = df_test)
df_test_processed %>% head()
Logistic Regression ✅
KNN ✅
Decision Tree ✅
Random Forest ✅
XGBoost ✅
At this first stage, we just fit each model and compute a single round of performance metrics. Later on, we will bring in cross-validation and hyperparameter tuning (a preview sketch closes this section).
logistic_model <- logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
start_time <- Sys.time()
logistic_fit <- logistic_model %>%
fit(response ~ .,
data = df_train)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a logsitc model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a logsitc model: 1.84 seconds"
class_preds <- logistic_fit %>%
predict(new_data = df_test,
type = 'class')
class_preds
prob_preds <- logistic_fit %>%
predict(new_data = df_test,
type = 'prob')
prob_preds
insurance_results <- df_test %>%
select(response) %>%
bind_cols(class_preds, prob_preds)
insurance_results
conf_mat(insurance_results,
truth = response,
estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 11678 83600
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
lr_metrics <- custom_metrics(insurance_results,
truth = response,
estimate = .pred_class)
lr_metrics
# ROC Curve
insurance_results %>%
roc_curve(truth = response, .pred_yes) %>%
autoplot()
# Calculating AUC
roc_auc(insurance_results,
truth = response,
.pred_yes)
# Saving metrics results
lr_metrics <- pivot_wider(lr_metrics,
names_from = .metric,
values_from = .estimate) %>%
mutate(model = 'logistic regression') %>%
select(model, accuracy:f_meas)
lr_metrics
More info: https://parsnip.tidymodels.org/reference/nearest_neighbor.html
knn_model <- nearest_neighbor(weight_func = "rectangular",
neighbors = 3) %>%
set_mode("classification") %>%
set_engine("kknn")
start_time <- Sys.time()
library(kknn)
knn_fit <- knn_model %>%
fit(response ~ .,
data = df_train)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a knn model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a knn model: 10.21 seconds"
# Predictions
class_preds <- knn_fit %>%
predict(new_data = df_test,
type = 'class')
# Estimated Probabilities
prob_preds <- knn_fit %>%
predict(new_data = df_test,
type = 'prob')
# Combine results
knn_results <- df_test %>%
select(response) %>%
bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(knn_results,
truth = response,
estimate = .pred_class)
## Truth
## Prediction yes no
## yes 2468 5507
## no 9210 78093
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
knn_metrics <- custom_metrics(knn_results,
truth = response,
estimate = .pred_class)
# Saving metrics results
knn_metrics <- pivot_wider(knn_metrics,
names_from = .metric,
values_from = .estimate) %>%
mutate(model = 'KNN') %>%
select(model, accuracy:f_meas)
knn_metrics
# Model Specification
dt_model <- decision_tree(tree_depth = 10) %>%
set_engine("rpart") %>%
set_mode("classification")
# Model fitting
start_time <- Sys.time()
dt_fit <- dt_model %>%
fit(response ~ .,
data = df_train)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a decision tree model: ", round(elapsed_time, 2)))
## [1] "Elapsed time to train a decision tree model: 0.69"
dt_fit
## parsnip model object
##
## n= 285831
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 285831 35032 no (0.1225619 0.8774381) *
library(rpart.plot)
dt_fit %>%
extract_fit_engine() %>%
rpart.plot()
# Predictions
class_preds <- dt_fit %>%
predict(new_data = df_test,
type = 'class')
# Estimated Probabilities
prob_preds <- dt_fit %>%
predict(new_data = df_test,
type = 'prob')
# Combine results
dt_results <- df_test %>%
select(response) %>%
bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(dt_results,
truth = response,
estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
dt_metrics <- custom_metrics(dt_results,
truth = response,
estimate = .pred_class)
# Saving metrics results
dt_metrics <- pivot_wider(dt_metrics,
names_from = .metric,
values_from = .estimate) %>%
mutate(model = 'decision tree') %>%
select(model, accuracy:f_meas)
dt_metrics
rf_model <- rand_forest(
mtry = 3,
trees = 1000,
min_n = 100
) %>%
set_mode("classification") %>%
set_engine("ranger")
start_time <- Sys.time()
library(ranger)
rf_fit <- rf_model %>%
fit(response ~ .,
data = df_train)
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a random forest model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a random forest model: 7.39 seconds"
# Predictions
class_preds <- rf_fit %>%
predict(new_data = df_test,
type = 'class')
# Estimated Probabilities
prob_preds <- rf_fit %>%
predict(new_data = df_test,
type = 'prob')
# Combine results
rf_results <- df_test %>%
select(response) %>%
bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(rf_results,
truth = response,
estimate = .pred_class)
## Truth
## Prediction yes no
## yes 148 159
## no 11530 83441
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
rf_metrics <- custom_metrics(rf_results,
truth = response,
estimate = .pred_class)
# Saving metrics results
rf_metrics <- pivot_wider(rf_metrics,
names_from = .metric,
values_from = .estimate) %>%
mutate(model = 'random forest') %>%
select(model, accuracy:f_meas)
rf_metrics
# Model
xgb_model <- boost_tree(
mtry = 1,
trees = 5,
min_n = 1000,
tree_depth = 4,
loss_reduction = 0,
sample_size = 1
) %>%
set_mode("classification") %>%
set_engine("xgboost",
cores = 1)
start_time <- Sys.time()
library(xgboost)
xgb_fit <- xgb_model %>%
fit(response ~ .,
data = df_train)
## [10:31:45] WARNING: src/learner.cc:767:
## Parameters: { "cores" } are not used.
end_time <- Sys.time()
# Calculate the elapsed time
elapsed_time <- end_time - start_time
print(paste0("Elapsed time to train a xgboost model: ", round(elapsed_time, 2), " seconds"))
## [1] "Elapsed time to train a xgboost model: 0.63 seconds"
# Predictions
class_preds <- xgb_fit %>%
predict(new_data = df_test,
type = 'class')
# Estimated Probabilities
prob_preds <- xgb_fit %>%
predict(new_data = df_test,
type = 'prob')
# Combine results
xgb_results <- df_test %>%
select(response) %>%
bind_cols(class_preds, prob_preds)
# Confusion Matrix
conf_mat(xgb_results,
truth = response,
estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 11678 83600
# Metrics
custom_metrics <- metric_set(accuracy, precision, recall, f_meas)
xgb_metrics <- custom_metrics(xgb_results,
truth = response,
estimate = .pred_class)
# Saving metrics results
xgb_metrics <- pivot_wider(xgb_metrics,
names_from = .metric,
values_from = .estimate) %>%
mutate(model = 'xgboost') %>%
select(model, accuracy:f_meas)
model_comparison_df <- rbind(lr_metrics,
knn_metrics,
dt_metrics,
rf_metrics,
xgb_metrics) %>%
arrange(desc(precision))
model_comparison_df
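As a preview of the next cycle, here is a minimal sketch of stratified cross-validation with tidymodels for one of the models above; the fold count and metric set are illustrative assumptions:
# 5-fold stratified CV of the logistic model (sketch; fold count is arbitrary)
set.seed(1512)
folds <- vfold_cv(df_train, v = 5, strata = response)
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") %>%
fit_resamples(response ~ .,
resamples = folds,
metrics = metric_set(roc_auc, accuracy)) %>%
collect_metrics()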