1 Imports

library(dplyr)
library(janitor)
library(skimr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(ggplot2)
library(grid)
library(gridExtra)

2 Data Collection

library(readr)
df <- read_csv("train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1<-janitor::clean_names(df)%>% 
  rename(days_associated=vintage,
         health_annual_paid=annual_premium) %>% 
  mutate(
    across(where(is.character),tolower),
    driving_license=ifelse(driving_license==1,"yes","no"),
    previously_insured=ifelse(previously_insured==1,"yes","no"),
    response=ifelse(response==1,"yes","no"),
    vehicle_age=case_when(
      vehicle_age=="< 1 year" ~ "below_1_year",
      vehicle_age=="1-2 year" ~ "between_1_2_years",
      vehicle_age=="> 2 years" ~ "over_2_years"
    )
) %>% 
  mutate_if(is.character,as.factor)%>% 
  mutate(response=factor(response,levels=c('yes','no')),
         driving_license=factor(driving_license,levels=c('yes','no')),
         previously_insured=factor(previously_insured,levels=c('yes', 'no')),
         vehicle_damage=factor(vehicle_damage,levels=c('yes','no'))
         ) 
  

glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
saveRDS(df1,'df_cleaned.rds')
df_cleaned <- readRDS('df_cleaned.rds')

3.1 Data Types

variable_classes<-tibble(variables=names(df1),
       type=unlist(lapply(df1,class)) )

variable_classes

4 Column Description

variables <- df1 %>% names()
description<-c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer has DL(yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance(yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past.(yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance(yes/no)"
)

df_description<-tibble(variables=variables,
       description=description)

kable(df_description,format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)
variables description
id Unique ID for the customer
gender Gender of the customer
age Age of the customer
driving_license Customer has DL(yes/no)
region_code Unique code for the region of the customer
previously_insured Customer already has Vehicle Insurance(yes/no)
vehicle_age Age of the Vehicle
vehicle_damage Customer got his/her vehicle damaged in the past.(yes/no)
health_annual_paid The amount customer needs to pay as premium in the year
policy_sales_channel Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
days_associated Number of Days, Customer has been associated with the company
response Customer is interested in car insurance(yes/no)

5 Estatistica Descritiva

  • Check data
skim(df_cleaned)
Data summary
Name df_cleaned
Number of rows 381109
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 mal: 206089, fem: 175020
driving_license 0 1 FALSE 2 yes: 380297, no: 812
previously_insured 0 1 FALSE 2 no: 206481, yes: 174628
vehicle_age 0 1 FALSE 3 bet: 200316, bel: 164786, ove: 16007
vehicle_damage 0 1 FALSE 2 yes: 192413, no: 188696
response 0 1 FALSE 2 no: 334399, yes: 46710

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 190555.00 110016.84 1 95278 190555 285832 381109 ▇▇▇▇▇
age 0 1 38.82 15.51 20 25 36 49 85 ▇▃▃▂▁
region_code 0 1 26.39 13.23 0 15 28 35 52 ▃▂▇▃▃
health_annual_paid 0 1 30564.39 17213.16 2630 24405 31669 39400 540165 ▇▁▁▁▁
policy_sales_channel 0 1 112.03 54.20 1 29 133 152 163 ▅▁▁▃▇
days_associated 0 1 154.35 83.67 10 82 154 227 299 ▇▇▇▇▇
summary(df_cleaned)
##        id            gender            age        driving_license
##  Min.   :     1   female:175020   Min.   :20.00   yes:380297     
##  1st Qu.: 95278   male  :206089   1st Qu.:25.00   no :   812     
##  Median :190555                   Median :36.00                  
##  Mean   :190555                   Mean   :38.82                  
##  3rd Qu.:285832                   3rd Qu.:49.00                  
##  Max.   :381109                   Max.   :85.00                  
##   region_code    previously_insured            vehicle_age     vehicle_damage
##  Min.   : 0.00   yes:174628         below_1_year     :164786   yes:192413    
##  1st Qu.:15.00   no :206481         between_1_2_years:200316   no :188696    
##  Median :28.00                      over_2_years     : 16007                 
##  Mean   :26.39                                                               
##  3rd Qu.:35.00                                                               
##  Max.   :52.00                                                               
##  health_annual_paid policy_sales_channel days_associated response    
##  Min.   :  2630     Min.   :  1          Min.   : 10.0   yes: 46710  
##  1st Qu.: 24405     1st Qu.: 29          1st Qu.: 82.0   no :334399  
##  Median : 31669     Median :133          Median :154.0               
##  Mean   : 30564     Mean   :112          Mean   :154.3               
##  3rd Qu.: 39400     3rd Qu.:152          3rd Qu.:227.0               
##  Max.   :540165     Max.   :163          Max.   :299.0

6 General Overview

library(gtsummary)

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ 'categorical',
                driving_license ~ 'categorical',
                previously_insured ~ 'categorical',
                vehicle_damage ~ 'categorical'),
    digits = list(all_categorical() ~ c(0,2))
  )
Characteristic N = 381,1091
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license
    yes 380,297 (99.79%)
    no 812 (0.21%)
region_code 28 (15, 35)
previously_insured
    yes 174,628 (45.82%)
    no 206,481 (54.18%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage
    yes 192,413 (50.49%)
    no 188,696 (49.51%)
health_annual_paid 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response
    yes 46,710 (12.26%)
    no 334,399 (87.74%)
1 n (%); Median (Q1, Q3)

7 Mais detalhes estatisticos

num_atributos<-df_cleaned %>%
  select(age,health_annual_paid,days_associated)
tab_descr<-descr(num_atributos,style = 'rmarkdown') %>% round(2)

kable(data.frame(tab_descr),format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)
age days_associated health_annual_paid
Mean 38.82 154.35 30564.39
Std.Dev 15.51 83.67 17213.16
Min 20.00 10.00 2630.00
Q1 25.00 82.00 24405.00
Median 36.00 154.00 31669.00
Q3 49.00 227.00 39400.00
Max 85.00 299.00 540165.00
MAD 17.79 108.23 11125.43
IQR 24.00 145.00 14995.00
CV 0.40 0.54 0.56
Skewness 0.67 0.00 1.77
SE.Skewness 0.00 0.00 0.00
Kurtosis -0.57 -1.20 34.00
N.Valid 381109.00 381109.00 381109.00
Pct.Valid 100.00 100.00 100.00

8 Visualização

  • Numérico
options(encoding = "UTF-8")
Sys.setlocale("LC_ALL", "pt_BR.UTF-8")
## [1] "LC_COLLATE=pt_BR.UTF-8;LC_CTYPE=pt_BR.UTF-8;LC_MONETARY=pt_BR.UTF-8;LC_NUMERIC=C;LC_TIME=pt_BR.UTF-8"
#--------------age--------------
age_plt<-num_atributos %>% 
  ggplot(aes(x=age))+
  geom_histogram(aes(y=after_stat(density)),binwidth=1,
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='idade',y='densidade',title='idade_cliente')+
  theme_minimal()

#--------------health_annual_paid--------------
paid_plt<-num_atributos %>% 
  ggplot(aes(x=health_annual_paid))+
  geom_histogram(aes(y=after_stat(density)),binwidth = 10000,
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='saúde_anual_pago',y='densidade',title='distribuição de \nclientes')+
  theme_minimal()

#--------------days_associated--------------
days_plt<-num_atributos %>% 
  ggplot(aes(x=days_associated))+
  geom_histogram(aes(y=after_stat(density)),
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='dias_associados',y='densidade',title='distribuição de clientes \npor dia')+
  theme_minimal()

grid.arrange(age_plt,paid_plt,days_plt,ncol=3)

  • Categórico

    num_names<-names(num_atributos)
    atributo_cat<-df_cleaned %>% 
      select(-id,-one_of(num_names))
#---------gender----------
gender_plt<-atributo_cat %>% 
  ggplot(aes(x=gender))+
  geom_bar(aes(fill=gender),
           show.legend = FALSE)+
  labs(x='Genero',y='#',title='distribuição \nde genero ')+
  theme_minimal()


#---------driving_license----------
permisson_plt<-atributo_cat %>% 
  ggplot(aes(x=driving_license))+
  geom_bar(aes(fill=driving_license),
           show.legend = FALSE)+
  labs(x='permissão para dirigir',y='#',title='permissão \npara dirigir')+
  theme_minimal()


#---------region_cod----------
region_plt<-atributo_cat %>% 
  ggplot(aes(x=region_code))+
  geom_bar(aes(fill=factor(region_code)),
           show.legend = FALSE)+
  labs(x='região',y='#',title='distribuição \npor região')+
  theme_minimal()

#---------previously_insured----------
previous_plt<-atributo_cat %>% 
  ggplot(aes(x=previously_insured))+
  geom_bar(aes(fill=previously_insured),
           show.legend = FALSE)+
  labs(x='anteriormente assegurados',y='#',title='distribuição de \nanteriormente assegurados')+
  theme_minimal()

#---------vehicle_age----------
vehicle_plt<-atributo_cat %>% 
  ggplot(aes(x=vehicle_age))+
  geom_bar(aes(fill=vehicle_age),
           show.legend = FALSE)+
  labs(x='idade do veiculo',y='#',title='distribuição por \nidade do veiculo')+
  theme_minimal()


#---------vehicle_damagee----------
damage_plt<-atributo_cat %>% 
  ggplot(aes(x=vehicle_damage))+
  geom_bar(aes(fill=vehicle_damage),
           show.legend = FALSE)+
  labs(x='dano no veiculo',y='#',title='clientes que tiveram \ndano no veiculo')+
  theme_minimal()

#---------policy_sales_channel----------
channel_plt<-atributo_cat %>% 
  ggplot(aes(x=policy_sales_channel))+
  geom_bar(aes(fill=factor(policy_sales_channel)),
           show.legend = FALSE)+
  labs(x='canal de vendas',y='#',title='distribuição por \ncanal de vendas')+
  theme_minimal()

#---------response----------
response_plt<-atributo_cat %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=response),
           show.legend = FALSE)+
  labs(x='resposta',y='#',title='distribuição por \nresposta')+
  theme_minimal()

grid.arrange(gender_plt,permisson_plt,region_plt,previous_plt,vehicle_plt,damage_plt,channel_plt,response_plt,ncol=4,nrow=2)

9 Validação de Hipóteses

9.1 H1- O interesse pelo seguro de carro é maior em clientes mais velhos. ✅

# boxplot
age_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=age))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'comparação por idade',y='age',x="response")+
  theme_bw()
  
age_boxplot

#histograma
age_plot<-df_cleaned %>% 
  ggplot(aes(x= age))+
  geom_histogram(binwidth=1,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição por idade")

age_plot

df_cleaned %>% 
  select(age,response) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
age 43 (35, 51) 34 (24, 49)
1 Median (Q1, Q3)

Pessoas jovesns estão mais desinteressadas pelo seguro de carro. A mediana da idade dos clientes interessados é de 43 anos (IQR:35,51), e a dos desinteressados é de 34 anos (IQR:24,49)

9.1.1 H2-O interesse pelo seguro de carro é maior entre as mulheres. ❌

gender_plot<-df_cleaned %>% 
  select(response,gender) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=gender),position='dodge')+
  labs(title = 'genero vs resposta', x='genero', y='numero_clientes')+
  theme_bw()

gender_plot

df_cleaned %>% 
  select(response,gender) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
gender

    female 18,185 (39%) 156,835 (47%)
    male 28,525 (61%) 177,564 (53%)
1 n (%)

Na verdades os homens são os mais interessados.

9.1.2 H3-O interesse pelo seguro de carro é maior em carros mais novos. ❌

car_age_plot<-df_cleaned %>% 
  select(response,vehicle_age) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=vehicle_age),position = 'dodge')+
  labs(title = 'idade_veiculo vs resposta',x='resposta',y='numeros_clientes')+
  theme_bw()

car_age_plot

Na verdade, a maioria das respostas sim, são para clientes que tem carros de 1 a 2 anos, localizados em uma faixa intermediaria de classificação.

df_cleaned %>% 
  select(response,vehicle_age) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
vehicle_age

    below_1_year 7,202 (15%) 157,584 (47%)
    between_1_2_years 34,806 (75%) 165,510 (49%)
    over_2_years 4,702 (10%) 11,305 (3.4%)
1 n (%)

9.1.3 H4-O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo. ✅

damage_car_plot<-df_cleaned %>% 
  select(response,vehicle_damage) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=vehicle_damage),position = 'dodge')+
  labs(title = 'dano_veiculo vs resposta',x='resposta',y='numeros_clientes')+
  theme_bw()

damage_car_plot

df_cleaned %>% 
  select(response,vehicle_damage) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
vehicle_damage 45,728 (98%) 146,685 (44%)
1 n (%)

De fato, interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.

9.1.4 H5- O interesse é maior para quem teve carro previamente assegurado: ❌

car_insured_plot<-df_cleaned %>% 
  select(response,previously_insured) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=previously_insured),position = 'dodge')+
  labs(title = 'previamente_assegurados vs respostas',x='resposta',y='numeros_clientes')+
  theme_bw()

car_insured_plot

df_cleaned %>% 
  select(response,previously_insured) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
previously_insured 158 (0.3%) 174,470 (52%)
1 n (%)

Somente 0.3% das pessoas que tinham o carro previamente assegurados disseram sim.

9.1.5 H6-O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde. ❌

# boxplot
expenses_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=health_annual_paid))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'gasto_anual vs interesse',y='gasto_anual',x="response")+
  theme_bw()
  
expenses_boxplot

#histograma
expenses_plot<-df_cleaned %>% 
  ggplot(aes(x=health_annual_paid))+
  geom_histogram(binwidth=10000,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição")

expenses_plot

df_cleaned %>% 
  select(response,health_annual_paid) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
health_annual_paid 33,002 (24,868, 41,297) 31,504 (24,351, 39,120)
1 Median (Q1, Q3)

9.1.6 H7-O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde: ❌

days_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=days_associated))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'dias_associados vs resposta',y='dias',x="resposta")+
  theme_bw()
  
days_boxplot

#histograma
days_plot<-df_cleaned %>% 
  ggplot(aes(x=days_associated))+
  geom_histogram(binwidth=30,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição")

days_plot

df_cleaned %>% 
  select(response,days_associated) %>% 
  tbl_summary(by=response)
Characteristic yes
N = 46,710
1
no
N = 334,399
1
days_associated 154 (82, 226) 154 (82, 227)
1 Median (Q1, Q3)

Falso, pois os números são equilibrados.

9.2 Conclusão de Hipóteses

hypothesis<-c(
 " H1)O interesse pelo seguro de carro é maior em clientes mais velhos.",
  "H2)-O interesse pelo seguro de carro é maior entre as mulheres.",
  "H3)O interesse pelo seguro de carro é maior em carros mais novos.",
  "H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.",
  "H5)O interesse é maior para quem teve carro previamente assegurado.",
  "H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde.",
  "H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde.")

conclusion<-c(
  'True',
  'False',
  'False',
  'True',
  'False',
  'False',
  'False'
)

relevance<-c(
  'high',
  'medium',
  'high',
  'high',
  'high',
  'low',
  'low'
)

tab_hip<-tibble(
  hypothesis=hypothesis,
  conclusion=conclusion,
  relevance=relevance
)

kable(tab_hip,format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)
hypothesis conclusion relevance
H1)O interesse pelo seguro de carro é maior em clientes mais velhos. True high
H2)-O interesse pelo seguro de carro é maior entre as mulheres. False medium
H3)O interesse pelo seguro de carro é maior em carros mais novos. False high
H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo. True high
H5)O interesse é maior para quem teve carro previamente assegurado. False high
H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde. False low
H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde. False low