library(dplyr)
library(janitor)
library(skimr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(ggplot2)
library(grid)
library(gridExtra)
library(readr)
df <- read_csv("train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
df1<-janitor::clean_names(df)%>%
rename(days_associated=vintage,
health_annual_paid=annual_premium) %>%
mutate(
across(where(is.character),tolower),
driving_license=ifelse(driving_license==1,"yes","no"),
previously_insured=ifelse(previously_insured==1,"yes","no"),
response=ifelse(response==1,"yes","no"),
vehicle_age=case_when(
vehicle_age=="< 1 year" ~ "below_1_year",
vehicle_age=="1-2 year" ~ "between_1_2_years",
vehicle_age=="> 2 years" ~ "over_2_years"
)
) %>%
mutate_if(is.character,as.factor)%>%
mutate(response=factor(response,levels=c('yes','no')),
driving_license=factor(driving_license,levels=c('yes','no')),
previously_insured=factor(previously_insured,levels=c('yes', 'no')),
vehicle_damage=factor(vehicle_damage,levels=c('yes','no'))
)
glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender <fct> male, male, male, male, female, female, male, fem…
## $ age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
saveRDS(df1,'df_cleaned.rds')
df_cleaned <- readRDS('df_cleaned.rds')
variable_classes<-tibble(variables=names(df1),
type=unlist(lapply(df1,class)) )
variable_classes
variables <- df1 %>% names()
description<-c(
"Unique ID for the customer",
"Gender of the customer",
"Age of the customer",
"Customer has DL(yes/no)",
"Unique code for the region of the customer",
"Customer already has Vehicle Insurance(yes/no)",
"Age of the Vehicle",
"Customer got his/her vehicle damaged in the past.(yes/no)",
"The amount customer needs to pay as premium in the year",
"Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.",
"Number of Days, Customer has been associated with the company",
"Customer is interested in car insurance(yes/no)"
)
df_description<-tibble(variables=variables,
description=description)
kable(df_description,format="html") %>%
kable_styling(bootstrap_options = 'striped',full_width = FALSE)
| variables | description |
|---|---|
| id | Unique ID for the customer |
| gender | Gender of the customer |
| age | Age of the customer |
| driving_license | Customer has DL(yes/no) |
| region_code | Unique code for the region of the customer |
| previously_insured | Customer already has Vehicle Insurance(yes/no) |
| vehicle_age | Age of the Vehicle |
| vehicle_damage | Customer got his/her vehicle damaged in the past.(yes/no) |
| health_annual_paid | The amount customer needs to pay as premium in the year |
| policy_sales_channel | Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc. |
| days_associated | Number of Days, Customer has been associated with the company |
| response | Customer is interested in car insurance(yes/no) |
skim(df_cleaned)
| Name | df_cleaned |
| Number of rows | 381109 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| gender | 0 | 1 | FALSE | 2 | mal: 206089, fem: 175020 |
| driving_license | 0 | 1 | FALSE | 2 | yes: 380297, no: 812 |
| previously_insured | 0 | 1 | FALSE | 2 | no: 206481, yes: 174628 |
| vehicle_age | 0 | 1 | FALSE | 3 | bet: 200316, bel: 164786, ove: 16007 |
| vehicle_damage | 0 | 1 | FALSE | 2 | yes: 192413, no: 188696 |
| response | 0 | 1 | FALSE | 2 | no: 334399, yes: 46710 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 190555.00 | 110016.84 | 1 | 95278 | 190555 | 285832 | 381109 | ▇▇▇▇▇ |
| age | 0 | 1 | 38.82 | 15.51 | 20 | 25 | 36 | 49 | 85 | ▇▃▃▂▁ |
| region_code | 0 | 1 | 26.39 | 13.23 | 0 | 15 | 28 | 35 | 52 | ▃▂▇▃▃ |
| health_annual_paid | 0 | 1 | 30564.39 | 17213.16 | 2630 | 24405 | 31669 | 39400 | 540165 | ▇▁▁▁▁ |
| policy_sales_channel | 0 | 1 | 112.03 | 54.20 | 1 | 29 | 133 | 152 | 163 | ▅▁▁▃▇ |
| days_associated | 0 | 1 | 154.35 | 83.67 | 10 | 82 | 154 | 227 | 299 | ▇▇▇▇▇ |
summary(df_cleaned)
## id gender age driving_license
## Min. : 1 female:175020 Min. :20.00 yes:380297
## 1st Qu.: 95278 male :206089 1st Qu.:25.00 no : 812
## Median :190555 Median :36.00
## Mean :190555 Mean :38.82
## 3rd Qu.:285832 3rd Qu.:49.00
## Max. :381109 Max. :85.00
## region_code previously_insured vehicle_age vehicle_damage
## Min. : 0.00 yes:174628 below_1_year :164786 yes:192413
## 1st Qu.:15.00 no :206481 between_1_2_years:200316 no :188696
## Median :28.00 over_2_years : 16007
## Mean :26.39
## 3rd Qu.:35.00
## Max. :52.00
## health_annual_paid policy_sales_channel days_associated response
## Min. : 2630 Min. : 1 Min. : 10.0 yes: 46710
## 1st Qu.: 24405 1st Qu.: 29 1st Qu.: 82.0 no :334399
## Median : 31669 Median :133 Median :154.0
## Mean : 30564 Mean :112 Mean :154.3
## 3rd Qu.: 39400 3rd Qu.:152 3rd Qu.:227.0
## Max. :540165 Max. :163 Max. :299.0
library(gtsummary)
df_cleaned %>%
select(-id) %>%
tbl_summary(
type = list(response ~ 'categorical',
driving_license ~ 'categorical',
previously_insured ~ 'categorical',
vehicle_damage ~ 'categorical'),
digits = list(all_categorical() ~ c(0,2))
)
| Characteristic | N = 381,1091 |
|---|---|
| gender | |
| female | 175,020 (45.92%) |
| male | 206,089 (54.08%) |
| age | 36 (25, 49) |
| driving_license | |
| yes | 380,297 (99.79%) |
| no | 812 (0.21%) |
| region_code | 28 (15, 35) |
| previously_insured | |
| yes | 174,628 (45.82%) |
| no | 206,481 (54.18%) |
| vehicle_age | |
| below_1_year | 164,786 (43.24%) |
| between_1_2_years | 200,316 (52.56%) |
| over_2_years | 16,007 (4.20%) |
| vehicle_damage | |
| yes | 192,413 (50.49%) |
| no | 188,696 (49.51%) |
| health_annual_paid | 31,669 (24,405, 39,400) |
| policy_sales_channel | 133 (29, 152) |
| days_associated | 154 (82, 227) |
| response | |
| yes | 46,710 (12.26%) |
| no | 334,399 (87.74%) |
| 1 n (%); Median (Q1, Q3) | |
num_atributos<-df_cleaned %>%
select(age,health_annual_paid,days_associated)
tab_descr<-descr(num_atributos,style = 'rmarkdown') %>% round(2)
kable(data.frame(tab_descr),format="html") %>%
kable_styling(bootstrap_options = 'striped',full_width = FALSE)
| age | days_associated | health_annual_paid | |
|---|---|---|---|
| Mean | 38.82 | 154.35 | 30564.39 |
| Std.Dev | 15.51 | 83.67 | 17213.16 |
| Min | 20.00 | 10.00 | 2630.00 |
| Q1 | 25.00 | 82.00 | 24405.00 |
| Median | 36.00 | 154.00 | 31669.00 |
| Q3 | 49.00 | 227.00 | 39400.00 |
| Max | 85.00 | 299.00 | 540165.00 |
| MAD | 17.79 | 108.23 | 11125.43 |
| IQR | 24.00 | 145.00 | 14995.00 |
| CV | 0.40 | 0.54 | 0.56 |
| Skewness | 0.67 | 0.00 | 1.77 |
| SE.Skewness | 0.00 | 0.00 | 0.00 |
| Kurtosis | -0.57 | -1.20 | 34.00 |
| N.Valid | 381109.00 | 381109.00 | 381109.00 |
| Pct.Valid | 100.00 | 100.00 | 100.00 |
options(encoding = "UTF-8")
Sys.setlocale("LC_ALL", "pt_BR.UTF-8")
## [1] "LC_COLLATE=pt_BR.UTF-8;LC_CTYPE=pt_BR.UTF-8;LC_MONETARY=pt_BR.UTF-8;LC_NUMERIC=C;LC_TIME=pt_BR.UTF-8"
#--------------age--------------
age_plt<-num_atributos %>%
ggplot(aes(x=age))+
geom_histogram(aes(y=after_stat(density)),binwidth=1,
color='gray',fill='lightblue',alpha=0.5)+
geom_density(color='blue')+
labs(x='idade',y='densidade',title='idade_cliente')+
theme_minimal()
#--------------health_annual_paid--------------
paid_plt<-num_atributos %>%
ggplot(aes(x=health_annual_paid))+
geom_histogram(aes(y=after_stat(density)),binwidth = 10000,
color='gray',fill='lightblue',alpha=0.5)+
geom_density(color='blue')+
labs(x='saúde_anual_pago',y='densidade',title='distribuição de \nclientes')+
theme_minimal()
#--------------days_associated--------------
days_plt<-num_atributos %>%
ggplot(aes(x=days_associated))+
geom_histogram(aes(y=after_stat(density)),
color='gray',fill='lightblue',alpha=0.5)+
geom_density(color='blue')+
labs(x='dias_associados',y='densidade',title='distribuição de clientes \npor dia')+
theme_minimal()
grid.arrange(age_plt,paid_plt,days_plt,ncol=3)
Categórico
num_names<-names(num_atributos)
atributo_cat<-df_cleaned %>%
select(-id,-one_of(num_names))#---------gender----------
gender_plt<-atributo_cat %>%
ggplot(aes(x=gender))+
geom_bar(aes(fill=gender),
show.legend = FALSE)+
labs(x='Genero',y='#',title='distribuição \nde genero ')+
theme_minimal()
#---------driving_license----------
permisson_plt<-atributo_cat %>%
ggplot(aes(x=driving_license))+
geom_bar(aes(fill=driving_license),
show.legend = FALSE)+
labs(x='permissão para dirigir',y='#',title='permissão \npara dirigir')+
theme_minimal()
#---------region_cod----------
region_plt<-atributo_cat %>%
ggplot(aes(x=region_code))+
geom_bar(aes(fill=factor(region_code)),
show.legend = FALSE)+
labs(x='região',y='#',title='distribuição \npor região')+
theme_minimal()
#---------previously_insured----------
previous_plt<-atributo_cat %>%
ggplot(aes(x=previously_insured))+
geom_bar(aes(fill=previously_insured),
show.legend = FALSE)+
labs(x='anteriormente assegurados',y='#',title='distribuição de \nanteriormente assegurados')+
theme_minimal()
#---------vehicle_age----------
vehicle_plt<-atributo_cat %>%
ggplot(aes(x=vehicle_age))+
geom_bar(aes(fill=vehicle_age),
show.legend = FALSE)+
labs(x='idade do veiculo',y='#',title='distribuição por \nidade do veiculo')+
theme_minimal()
#---------vehicle_damagee----------
damage_plt<-atributo_cat %>%
ggplot(aes(x=vehicle_damage))+
geom_bar(aes(fill=vehicle_damage),
show.legend = FALSE)+
labs(x='dano no veiculo',y='#',title='clientes que tiveram \ndano no veiculo')+
theme_minimal()
#---------policy_sales_channel----------
channel_plt<-atributo_cat %>%
ggplot(aes(x=policy_sales_channel))+
geom_bar(aes(fill=factor(policy_sales_channel)),
show.legend = FALSE)+
labs(x='canal de vendas',y='#',title='distribuição por \ncanal de vendas')+
theme_minimal()
#---------response----------
response_plt<-atributo_cat %>%
ggplot(aes(x=response))+
geom_bar(aes(fill=response),
show.legend = FALSE)+
labs(x='resposta',y='#',title='distribuição por \nresposta')+
theme_minimal()
grid.arrange(gender_plt,permisson_plt,region_plt,previous_plt,vehicle_plt,damage_plt,channel_plt,response_plt,ncol=4,nrow=2)
# boxplot
age_boxplot<-df_cleaned %>%
ggplot(aes(x=response,y=age))+
stat_boxplot(geom='errorbar',width=0.6)+
geom_boxplot(aes(fill=response),show.legend = FALSE)+
labs(title = 'comparação por idade',y='age',x="response")+
theme_bw()
age_boxplot
#histograma
age_plot<-df_cleaned %>%
ggplot(aes(x= age))+
geom_histogram(binwidth=1,color='gray',fill='navy')+
facet_wrap(vars(response),nrow=2,scales="free_y")+
labs(y='numero de clientes')+
ggtitle("distribuição por idade")
age_plot
df_cleaned %>%
select(age,response) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| age | 43 (35, 51) | 34 (24, 49) |
| 1 Median (Q1, Q3) | ||
Pessoas jovesns estão mais desinteressadas pelo seguro de carro. A mediana da idade dos clientes interessados é de 43 anos (IQR:35,51), e a dos desinteressados é de 34 anos (IQR:24,49)
gender_plot<-df_cleaned %>%
select(response,gender) %>%
ggplot(aes(x=response))+
geom_bar(aes(fill=gender),position='dodge')+
labs(title = 'genero vs resposta', x='genero', y='numero_clientes')+
theme_bw()
gender_plot
df_cleaned %>%
select(response,gender) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| gender | ||
| female | 18,185 (39%) | 156,835 (47%) |
| male | 28,525 (61%) | 177,564 (53%) |
| 1 n (%) | ||
Na verdades os homens são os mais interessados.
car_age_plot<-df_cleaned %>%
select(response,vehicle_age) %>%
ggplot(aes(x=response))+
geom_bar(aes(fill=vehicle_age),position = 'dodge')+
labs(title = 'idade_veiculo vs resposta',x='resposta',y='numeros_clientes')+
theme_bw()
car_age_plot
Na verdade, a maioria das respostas sim, são para clientes que tem carros de 1 a 2 anos, localizados em uma faixa intermediaria de classificação.
df_cleaned %>%
select(response,vehicle_age) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| vehicle_age | ||
| below_1_year | 7,202 (15%) | 157,584 (47%) |
| between_1_2_years | 34,806 (75%) | 165,510 (49%) |
| over_2_years | 4,702 (10%) | 11,305 (3.4%) |
| 1 n (%) | ||
damage_car_plot<-df_cleaned %>%
select(response,vehicle_damage) %>%
ggplot(aes(x=response))+
geom_bar(aes(fill=vehicle_damage),position = 'dodge')+
labs(title = 'dano_veiculo vs resposta',x='resposta',y='numeros_clientes')+
theme_bw()
damage_car_plot
df_cleaned %>%
select(response,vehicle_damage) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| vehicle_damage | 45,728 (98%) | 146,685 (44%) |
| 1 n (%) | ||
De fato, interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.
car_insured_plot<-df_cleaned %>%
select(response,previously_insured) %>%
ggplot(aes(x=response))+
geom_bar(aes(fill=previously_insured),position = 'dodge')+
labs(title = 'previamente_assegurados vs respostas',x='resposta',y='numeros_clientes')+
theme_bw()
car_insured_plot
df_cleaned %>%
select(response,previously_insured) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| previously_insured | 158 (0.3%) | 174,470 (52%) |
| 1 n (%) | ||
Somente 0.3% das pessoas que tinham o carro previamente assegurados disseram sim.
# boxplot
expenses_boxplot<-df_cleaned %>%
ggplot(aes(x=response,y=health_annual_paid))+
stat_boxplot(geom='errorbar',width=0.6)+
geom_boxplot(aes(fill=response),show.legend = FALSE)+
labs(title = 'gasto_anual vs interesse',y='gasto_anual',x="response")+
theme_bw()
expenses_boxplot
#histograma
expenses_plot<-df_cleaned %>%
ggplot(aes(x=health_annual_paid))+
geom_histogram(binwidth=10000,color='gray',fill='navy')+
facet_wrap(vars(response),nrow=2,scales="free_y")+
labs(y='numero de clientes')+
ggtitle("distribuição")
expenses_plot
df_cleaned %>%
select(response,health_annual_paid) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| health_annual_paid | 33,002 (24,868, 41,297) | 31,504 (24,351, 39,120) |
| 1 Median (Q1, Q3) | ||
days_boxplot<-df_cleaned %>%
ggplot(aes(x=response,y=days_associated))+
stat_boxplot(geom='errorbar',width=0.6)+
geom_boxplot(aes(fill=response),show.legend = FALSE)+
labs(title = 'dias_associados vs resposta',y='dias',x="resposta")+
theme_bw()
days_boxplot
#histograma
days_plot<-df_cleaned %>%
ggplot(aes(x=days_associated))+
geom_histogram(binwidth=30,color='gray',fill='navy')+
facet_wrap(vars(response),nrow=2,scales="free_y")+
labs(y='numero de clientes')+
ggtitle("distribuição")
days_plot
df_cleaned %>%
select(response,days_associated) %>%
tbl_summary(by=response)
| Characteristic | yes N = 46,7101 |
no N = 334,3991 |
|---|---|---|
| days_associated | 154 (82, 226) | 154 (82, 227) |
| 1 Median (Q1, Q3) | ||
Falso, pois os números são equilibrados.
hypothesis<-c(
" H1)O interesse pelo seguro de carro é maior em clientes mais velhos.",
"H2)-O interesse pelo seguro de carro é maior entre as mulheres.",
"H3)O interesse pelo seguro de carro é maior em carros mais novos.",
"H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo.",
"H5)O interesse é maior para quem teve carro previamente assegurado.",
"H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde.",
"H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde.")
conclusion<-c(
'True',
'False',
'False',
'True',
'False',
'False',
'False'
)
relevance<-c(
'high',
'medium',
'high',
'high',
'high',
'low',
'low'
)
tab_hip<-tibble(
hypothesis=hypothesis,
conclusion=conclusion,
relevance=relevance
)
kable(tab_hip,format="html") %>%
kable_styling(bootstrap_options = 'striped',full_width = FALSE)
| hypothesis | conclusion | relevance |
|---|---|---|
| H1)O interesse pelo seguro de carro é maior em clientes mais velhos. | True | high |
| H2)-O interesse pelo seguro de carro é maior entre as mulheres. | False | medium |
| H3)O interesse pelo seguro de carro é maior em carros mais novos. | False | high |
| H4)O interesse pelo seguro de carro é maior para usuários que tiveram danos anteriores no veículo. | True | high |
| H5)O interesse é maior para quem teve carro previamente assegurado. | False | high |
| H6)O interesse pelo seguro de carro é maior em quem gasta mais no seguro saúde. | False | low |
| H7)O interesse pelo seguro de carro é maior em quem está há mais tempo no seguro saúde. | False | low |