1 Imports

library(dplyr)
library(janitor)
library(skimr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(ggplot2)
library(grid)
library(gridExtra)

2 Data Collection

library(readr)
df <- read_csv("train.csv")
glimpse(df)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1<-janitor::clean_names(df)%>% 
  rename(days_associated=vintage,
         health_annual_paid=annual_premium) %>% 
  mutate(
    across(where(is.character),tolower),
    driving_license=ifelse(driving_license==1,"yes","no"),
    previously_insured=ifelse(previously_insured==1,"yes","no"),
    response=ifelse(response==1,"yes","no"),
    vehicle_age=case_when(
      vehicle_age=="< 1 year" ~ "below_1_year",
      vehicle_age=="1-2 year" ~ "between_1_2_years",
      vehicle_age=="> 2 years" ~ "over_2_years"
    )
) %>% 
  mutate_if(is.character,as.factor)%>% 
  mutate(response=factor(response,levels=c('yes','no')),
         driving_license=factor(driving_license,levels=c('yes','no')),
         previously_insured=factor(previously_insured,levels=c('yes', 'no')),
         vehicle_damage=factor(vehicle_damage,levels=c('yes','no'))
         ) 
  

glimpse(df1)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…

saveRDS(df1,'df_cleaned.rds')

df_cleaned <- readRDS('df_cleaned.rds')

3.1 Data Types

variable_classes<-tibble(variables=names(df1),
       type=unlist(lapply(df1,class)) )

variable_classes

4 Column Description

variables <- df1 %>% names()
description<-c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer has DL(yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance(yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past.(yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance(yes/no)"
)

df_description<-tibble(variables=variables,
       description=description)

kable(df_description,format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)

variables	description
id	Unique ID for the customer
gender	Gender of the customer
age	Age of the customer
driving_license	Customer has DL(yes/no)
region_code	Unique code for the region of the customer
previously_insured	Customer already has Vehicle Insurance(yes/no)
vehicle_age	Age of the Vehicle
vehicle_damage	Customer got his/her vehicle damaged in the past.(yes/no)
health_annual_paid	The amount customer needs to pay as premium in the year
policy_sales_channel	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
days_associated	Number of Days, Customer has been associated with the company
response	Customer is interested in car insurance(yes/no)

5 Estatistica Descritiva

Check data

skim(df_cleaned)

Data summary
Name	df_cleaned
Number of rows	381109
Number of columns	12
_______________________
Column type frequency:
factor	6
numeric	6
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
gender	1	FALSE	2	mal: 206089, fem: 175020
driving_license	1	FALSE	2	yes: 380297, no: 812
previously_insured	1	FALSE	2	no: 206481, yes: 174628
vehicle_age	1	FALSE	3	bet: 200316, bel: 164786, ove: 16007
vehicle_damage	1	FALSE	2	yes: 192413, no: 188696
response	1	FALSE	2	no: 334399, yes: 46710

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	190555.00	110016.84	1	95278	190555	285832	381109	▇▇▇▇▇
age	1	38.82	15.51	20	25	36	49	85	▇▃▃▂▁
region_code	1	26.39	13.23	0	15	28	35	52	▃▂▇▃▃
health_annual_paid	1	30564.39	17213.16	2630	24405	31669	39400	540165	▇▁▁▁▁
policy_sales_channel	1	112.03	54.20	1	29	133	152	163	▅▁▁▃▇
days_associated	1	154.35	83.67	10	82	154	227	299	▇▇▇▇▇

summary(df_cleaned)

##        id            gender            age        driving_license
##  Min.   :     1   female:175020   Min.   :20.00   yes:380297     
##  1st Qu.: 95278   male  :206089   1st Qu.:25.00   no :   812     
##  Median :190555                   Median :36.00                  
##  Mean   :190555                   Mean   :38.82                  
##  3rd Qu.:285832                   3rd Qu.:49.00                  
##  Max.   :381109                   Max.   :85.00                  
##   region_code    previously_insured            vehicle_age     vehicle_damage
##  Min.   : 0.00   yes:174628         below_1_year     :164786   yes:192413    
##  1st Qu.:15.00   no :206481         between_1_2_years:200316   no :188696    
##  Median :28.00                      over_2_years     : 16007                 
##  Mean   :26.39                                                               
##  3rd Qu.:35.00                                                               
##  Max.   :52.00                                                               
##  health_annual_paid policy_sales_channel days_associated response    
##  Min.   :  2630     Min.   :  1          Min.   : 10.0   yes: 46710  
##  1st Qu.: 24405     1st Qu.: 29          1st Qu.: 82.0   no :334399  
##  Median : 31669     Median :133          Median :154.0               
##  Mean   : 30564     Mean   :112          Mean   :154.3               
##  3rd Qu.: 39400     3rd Qu.:152          3rd Qu.:227.0               
##  Max.   :540165     Max.   :163          Max.   :299.0

6 General Overview

library(gtsummary)

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ 'categorical',
                driving_license ~ 'categorical',
                previously_insured ~ 'categorical',
                vehicle_damage ~ 'categorical'),
    digits = list(all_categorical() ~ c(0,2))
  )

Characteristic	N = 381,109¹
gender
female	175,020 (45.92%)
male	206,089 (54.08%)
age	36 (25, 49)
driving_license
yes	380,297 (99.79%)
no	812 (0.21%)
region_code	28 (15, 35)
previously_insured
yes	174,628 (45.82%)
no	206,481 (54.18%)
vehicle_age
below_1_year	164,786 (43.24%)
between_1_2_years	200,316 (52.56%)
over_2_years	16,007 (4.20%)
vehicle_damage
yes	192,413 (50.49%)
no	188,696 (49.51%)
health_annual_paid	31,669 (24,405, 39,400)
policy_sales_channel	133 (29, 152)
days_associated	154 (82, 227)
response
yes	46,710 (12.26%)
no	334,399 (87.74%)
¹ n (%); Median (Q1, Q3)

7 Mais detalhes estatisticos

num_atributos<-df_cleaned %>%
  select(age,health_annual_paid,days_associated)

tab_descr<-descr(num_atributos,style = 'rmarkdown') %>% round(2)

kable(data.frame(tab_descr),format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)

	age	days_associated	health_annual_paid
Mean	38.82	154.35	30564.39
Std.Dev	15.51	83.67	17213.16
Min	20.00	10.00	2630.00
Q1	25.00	82.00	24405.00
Median	36.00	154.00	31669.00
Q3	49.00	227.00	39400.00
Max	85.00	299.00	540165.00
MAD	17.79	108.23	11125.43
IQR	24.00	145.00	14995.00
CV	0.40	0.54	0.56
Skewness	0.67	0.00	1.77
SE.Skewness	0.00	0.00	0.00
Kurtosis	-0.57	-1.20	34.00
N.Valid	381109.00	381109.00	381109.00
Pct.Valid	100.00	100.00	100.00

8 Visualização

Numérico

options(encoding = "UTF-8")
Sys.setlocale("LC_ALL", "pt_BR.UTF-8")

## [1] "LC_COLLATE=pt_BR.UTF-8;LC_CTYPE=pt_BR.UTF-8;LC_MONETARY=pt_BR.UTF-8;LC_NUMERIC=C;LC_TIME=pt_BR.UTF-8"

#--------------age--------------
age_plt<-num_atributos %>% 
  ggplot(aes(x=age))+
  geom_histogram(aes(y=after_stat(density)),binwidth=1,
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='idade',y='densidade',title='idade_cliente')+
  theme_minimal()

#--------------health_annual_paid--------------
paid_plt<-num_atributos %>% 
  ggplot(aes(x=health_annual_paid))+
  geom_histogram(aes(y=after_stat(density)),binwidth = 10000,
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='saúde_anual_pago',y='densidade',title='distribuição de \nclientes')+
  theme_minimal()

#--------------days_associated--------------
days_plt<-num_atributos %>% 
  ggplot(aes(x=days_associated))+
  geom_histogram(aes(y=after_stat(density)),
                 color='gray',fill='lightblue',alpha=0.5)+
  geom_density(color='blue')+
  labs(x='dias_associados',y='densidade',title='distribuição de clientes \npor dia')+
  theme_minimal()

grid.arrange(age_plt,paid_plt,days_plt,ncol=3)

Categórico

num_names<-names(num_atributos)
atributo_cat<-df_cleaned %>% 
  select(-id,-one_of(num_names))

#---------gender----------
gender_plt<-atributo_cat %>% 
  ggplot(aes(x=gender))+
  geom_bar(aes(fill=gender),
           show.legend = FALSE)+
  labs(x='Genero',y='#',title='distribuição \nde genero ')+
  theme_minimal()


#---------driving_license----------
permisson_plt<-atributo_cat %>% 
  ggplot(aes(x=driving_license))+
  geom_bar(aes(fill=driving_license),
           show.legend = FALSE)+
  labs(x='permissão para dirigir',y='#',title='permissão \npara dirigir')+
  theme_minimal()


#---------region_cod----------
region_plt<-atributo_cat %>% 
  ggplot(aes(x=region_code))+
  geom_bar(aes(fill=factor(region_code)),
           show.legend = FALSE)+
  labs(x='região',y='#',title='distribuição \npor região')+
  theme_minimal()

#---------previously_insured----------
previous_plt<-atributo_cat %>% 
  ggplot(aes(x=previously_insured))+
  geom_bar(aes(fill=previously_insured),
           show.legend = FALSE)+
  labs(x='anteriormente assegurados',y='#',title='distribuição de \nanteriormente assegurados')+
  theme_minimal()

#---------vehicle_age----------
vehicle_plt<-atributo_cat %>% 
  ggplot(aes(x=vehicle_age))+
  geom_bar(aes(fill=vehicle_age),
           show.legend = FALSE)+
  labs(x='idade do veiculo',y='#',title='distribuição por \nidade do veiculo')+
  theme_minimal()


#---------vehicle_damagee----------
damage_plt<-atributo_cat %>% 
  ggplot(aes(x=vehicle_damage))+
  geom_bar(aes(fill=vehicle_damage),
           show.legend = FALSE)+
  labs(x='dano no veiculo',y='#',title='clientes que tiveram \ndano no veiculo')+
  theme_minimal()

#---------policy_sales_channel----------
channel_plt<-atributo_cat %>% 
  ggplot(aes(x=policy_sales_channel))+
  geom_bar(aes(fill=factor(policy_sales_channel)),
           show.legend = FALSE)+
  labs(x='canal de vendas',y='#',title='distribuição por \ncanal de vendas')+
  theme_minimal()

#---------response----------
response_plt<-atributo_cat %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=response),
           show.legend = FALSE)+
  labs(x='resposta',y='#',title='distribuição por \nresposta')+
  theme_minimal()

grid.arrange(gender_plt,permisson_plt,region_plt,previous_plt,vehicle_plt,damage_plt,channel_plt,response_plt,ncol=4,nrow=2)

9 Validação de Hipóteses

9.1 H1- O interesse pelo seguro de carro é maior em clientes mais velhos. ✅

# boxplot
age_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=age))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'comparação por idade',y='age',x="response")+
  theme_bw()
  
age_boxplot

#histograma
age_plot<-df_cleaned %>% 
  ggplot(aes(x= age))+
  geom_histogram(binwidth=1,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição por idade")

age_plot

df_cleaned %>% 
  select(age,response) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
age	43 (35, 51)	34 (24, 49)
¹ Median (Q1, Q3)

Pessoas jovesns estão mais desinteressadas pelo seguro de carro. A mediana da idade dos clientes interessados é de 43 anos (IQR:35,51), e a dos desinteressados é de 34 anos (IQR:24,49)

9.1.1 H2-O interesse pelo seguro de carro é maior entre as mulheres. ❌

gender_plot<-df_cleaned %>% 
  select(response,gender) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=gender),position='dodge')+
  labs(title = 'genero vs resposta', x='genero', y='numero_clientes')+
  theme_bw()

gender_plot

df_cleaned %>% 
  select(response,gender) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
gender
female	18,185 (39%)	156,835 (47%)
male	28,525 (61%)	177,564 (53%)
¹ n (%)

Na verdades os homens são os mais interessados.

9.1.2 H3-O interesse pelo seguro de carro é maior em carros mais novos. ❌

car_age_plot<-df_cleaned %>% 
  select(response,vehicle_age) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=vehicle_age),position = 'dodge')+
  labs(title = 'idade_veiculo vs resposta',x='resposta',y='numeros_clientes')+
  theme_bw()

car_age_plot

Na verdade, a maioria das respostas sim, são para clientes que tem carros de 1 a 2 anos, localizados em uma faixa intermediaria de classificação.

df_cleaned %>% 
  select(response,vehicle_age) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
vehicle_age
below_1_year	7,202 (15%)	157,584 (47%)
between_1_2_years	34,806 (75%)	165,510 (49%)
over_2_years	4,702 (10%)	11,305 (3.4%)
¹ n (%)

9.1.3 H4-O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo. ✅

damage_car_plot<-df_cleaned %>% 
  select(response,vehicle_damage) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=vehicle_damage),position = 'dodge')+
  labs(title = 'dano_veiculo vs resposta',x='resposta',y='numeros_clientes')+
  theme_bw()

damage_car_plot

df_cleaned %>% 
  select(response,vehicle_damage) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
vehicle_damage	45,728 (98%)	146,685 (44%)
¹ n (%)

De fato, interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.

9.1.4 H5- O interesse é maior para quem teve carro previamente assegurado: ❌

car_insured_plot<-df_cleaned %>% 
  select(response,previously_insured) %>% 
  ggplot(aes(x=response))+
  geom_bar(aes(fill=previously_insured),position = 'dodge')+
  labs(title = 'previamente_assegurados vs respostas',x='resposta',y='numeros_clientes')+
  theme_bw()

car_insured_plot

df_cleaned %>% 
  select(response,previously_insured) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
previously_insured	158 (0.3%)	174,470 (52%)
¹ n (%)

Somente 0.3% das pessoas que tinham o carro previamente assegurados disseram sim.

9.1.5 H6-O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde. ❌

# boxplot
expenses_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=health_annual_paid))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'gasto_anual vs interesse',y='gasto_anual',x="response")+
  theme_bw()
  
expenses_boxplot

#histograma
expenses_plot<-df_cleaned %>% 
  ggplot(aes(x=health_annual_paid))+
  geom_histogram(binwidth=10000,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição")

expenses_plot

df_cleaned %>% 
  select(response,health_annual_paid) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
health_annual_paid	33,002 (24,868, 41,297)	31,504 (24,351, 39,120)
¹ Median (Q1, Q3)

9.1.6 H7-O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde: ❌

days_boxplot<-df_cleaned %>% 
  ggplot(aes(x=response,y=days_associated))+
  stat_boxplot(geom='errorbar',width=0.6)+
  geom_boxplot(aes(fill=response),show.legend = FALSE)+
  labs(title = 'dias_associados vs resposta',y='dias',x="resposta")+
  theme_bw()
  
days_boxplot

#histograma
days_plot<-df_cleaned %>% 
  ggplot(aes(x=days_associated))+
  geom_histogram(binwidth=30,color='gray',fill='navy')+
  facet_wrap(vars(response),nrow=2,scales="free_y")+
  labs(y='numero de clientes')+
  ggtitle("distribuição")

days_plot

df_cleaned %>% 
  select(response,days_associated) %>% 
  tbl_summary(by=response)

Characteristic	yes N = 46,710¹	no N = 334,399¹
days_associated	154 (82, 226)	154 (82, 227)
¹ Median (Q1, Q3)

Falso, pois os números são equilibrados.

9.2 Conclusão de Hipóteses

hypothesis<-c(
 " H1)O interesse pelo seguro de carro é maior em clientes mais velhos.",
  "H2)-O interesse pelo seguro de carro é maior entre as mulheres.",
  "H3)O interesse pelo seguro de carro é maior em carros mais novos.",
  "H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.",
  "H5)O interesse é maior para quem teve carro previamente assegurado.",
  "H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde.",
  "H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde.")

conclusion<-c(
  'True',
  'False',
  'False',
  'True',
  'False',
  'False',
  'False'
)

relevance<-c(
  'high',
  'medium',
  'high',
  'high',
  'high',
  'low',
  'low'
)

tab_hip<-tibble(
  hypothesis=hypothesis,
  conclusion=conclusion,
  relevance=relevance
)

kable(tab_hip,format="html") %>%
  kable_styling(bootstrap_options = 'striped',full_width = FALSE)

hypothesis	conclusion	relevance
H1)O interesse pelo seguro de carro é maior em clientes mais velhos.	True	high
H2)-O interesse pelo seguro de carro é maior entre as mulheres.	False	medium
H3)O interesse pelo seguro de carro é maior em carros mais novos.	False	high
H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.	True	high
H5)O interesse é maior para quem teve carro previamente assegurado.	False	high
H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde.	False	low
H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde.	False	low

Data Science with R

Lucas Leal

20/01/2025