Imports
library(tidyverse)
library(janitor)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
Data Collection
df <- read_csv("01.data/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…
Data Cleaning
df1 <- janitor::clean_names(df) %>%
rename(
days_associated = vintage,
health_annual_paid = annual_premium
) %>%
mutate(
across(where(is.character), tolower),
driving_license = ifelse(driving_license == 1, "yes", "no"),
previously_insured = ifelse(previously_insured == 1, "yes", "no"),
response = ifelse(response == 1, "yes", "no"),
vehicle_age = case_when(
vehicle_age == "< 1 year" ~ "below_1_year",
vehicle_age == "1-2 year" ~ "between_1_2_years",
vehicle_age == "> 2 years" ~ "over_2_years"
)
) %>%
mutate_if(is.character, as.factor) %>%
# changing yes/no levels
mutate(
response = factor(response, levels = c("yes", "no")),
driving_license = factor(driving_license, levels = c("yes", "no")),
previously_insured = factor(previously_insured, levels = c("yes", "no")),
vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
)
glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender <fct> male, male, male, male, female, female, male, fem…
## $ age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")
Data Types
variable_calsses <- tibble(variables = names(df1),
types = unlist(lapply(df1, class))
)
variable_calsses
Column Description
variables <- df1 %>% names()
description <- c(
"Unique ID for the customer",
"Gender of the customer",
"Age of the customer",
"Customer does not have DL (yes/no)",
"Unique code for the region of the customer",
"Customer already has Vehicle Insurance (yes/no)",
"Age of the Vehicle",
"Customer got his/her vehicle damaged in the past (yes/no)",
"The amount customer needs to pay as premium in the year",
"Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc",
"Number of Days, Customer has been associated with the company",
"Customer is interested in car insurance (yes/no)"
)
df_description <- tibble(variables = variables, description = description)
kable(df_description, format = "html") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)
|
variables
|
description
|
|
id
|
Unique ID for the customer
|
|
gender
|
Gender of the customer
|
|
age
|
Age of the customer
|
|
driving_license
|
Customer does not have DL (yes/no)
|
|
region_code
|
Unique code for the region of the customer
|
|
previously_insured
|
Customer already has Vehicle Insurance (yes/no)
|
|
vehicle_age
|
Age of the Vehicle
|
|
vehicle_damage
|
Customer got his/her vehicle damaged in the past (yes/no)
|
|
health_annual_paid
|
The amount customer needs to pay as premium in the year
|
|
policy_sales_channel
|
Anonymized Code for the channel of outreaching to the customer ie.
Different Agents, Over Mail, Over Phone, In Person, etc
|
|
days_associated
|
Number of Days, Customer has been associated with the company
|
|
response
|
Customer is interested in car insurance (yes/no)
|
Descriptive
Statistics
# read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)
## Rows: 381,109
## Columns: 12
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender <fct> male, male, male, male, female, female, male, fem…
## $ age <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
- Check Data Structure so far:
skimr::skim(df_cleaned)
Data summary
| Name |
df_cleaned |
| Number of rows |
381109 |
| Number of columns |
12 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
6 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| gender |
0 |
1 |
FALSE |
2 |
mal: 206089, fem: 175020 |
| driving_license |
0 |
1 |
FALSE |
2 |
yes: 380297, no: 812 |
| previously_insured |
0 |
1 |
FALSE |
2 |
no: 206481, yes: 174628 |
| vehicle_age |
0 |
1 |
FALSE |
3 |
bet: 200316, bel: 164786, ove: 16007 |
| vehicle_damage |
0 |
1 |
FALSE |
2 |
yes: 192413, no: 188696 |
| response |
0 |
1 |
FALSE |
2 |
no: 334399, yes: 46710 |
Variable type: numeric
| id |
0 |
1 |
190555.00 |
110016.84 |
1 |
95278 |
190555 |
285832 |
381109 |
▇▇▇▇▇ |
| age |
0 |
1 |
38.82 |
15.51 |
20 |
25 |
36 |
49 |
85 |
▇▃▃▂▁ |
| region_code |
0 |
1 |
26.39 |
13.23 |
0 |
15 |
28 |
35 |
52 |
▃▂▇▃▃ |
| health_annual_paid |
0 |
1 |
30564.39 |
17213.16 |
2630 |
24405 |
31669 |
39400 |
540165 |
▇▁▁▁▁ |
| policy_sales_channel |
0 |
1 |
112.03 |
54.20 |
1 |
29 |
133 |
152 |
163 |
▅▁▁▃▇ |
| days_associated |
0 |
1 |
154.35 |
83.67 |
10 |
82 |
154 |
227 |
299 |
▇▇▇▇▇ |
General Overview
df_cleaned %>%
select(-id) %>%
tbl_summary(
type = list(response ~ "categorical",
driving_license ~ "categorical",
previously_insured ~ "categorical",
vehicle_damage ~ "categorical"),
digits = list(all_categorical() ~ c(0, 2))
)
| Characteristic |
N = 381,109 |
| gender |
|
| female |
175,020 (45.92%) |
| male |
206,089 (54.08%) |
| age |
36 (25, 49) |
| driving_license |
|
| yes |
380,297 (99.79%) |
| no |
812 (0.21%) |
| region_code |
28 (15, 35) |
| previously_insured |
|
| yes |
174,628 (45.82%) |
| no |
206,481 (54.18%) |
| vehicle_age |
|
| below_1_year |
164,786 (43.24%) |
| between_1_2_years |
200,316 (52.56%) |
| over_2_years |
16,007 (4.20%) |
| vehicle_damage |
|
| yes |
192,413 (50.49%) |
| no |
188,696 (49.51%) |
| health_annual_paid |
31,669 (24,405, 39,400) |
| policy_sales_channel |
133 (29, 152) |
| days_associated |
154 (82, 227) |
| response |
|
| yes |
46,710 (12.26%) |
| no |
334,399 (87.74%) |
More Detailed
Statistics
num_attributes <- df_cleaned %>%
select(age, health_annual_paid, days_associated)
descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)
## Error : Can't find summarytools
kable(data.frame(descriptive_tab), format = "html") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)
|
|
age
|
days_associated
|
health_annual_paid
|
|
Mean
|
38.82
|
154.35
|
30564.39
|
|
Std.Dev
|
15.51
|
83.67
|
17213.16
|
|
Min
|
20.00
|
10.00
|
2630.00
|
|
Q1
|
25.00
|
82.00
|
24405.00
|
|
Median
|
36.00
|
154.00
|
31669.00
|
|
Q3
|
49.00
|
227.00
|
39400.00
|
|
Max
|
85.00
|
299.00
|
540165.00
|
|
MAD
|
17.79
|
108.23
|
11125.43
|
|
IQR
|
24.00
|
145.00
|
14995.00
|
|
CV
|
0.40
|
0.54
|
0.56
|
|
Skewness
|
0.67
|
0.00
|
1.77
|
|
SE.Skewness
|
0.00
|
0.00
|
0.00
|
|
Kurtosis
|
-0.57
|
-1.20
|
34.00
|
|
N.Valid
|
381109.00
|
381109.00
|
381109.00
|
|
Pct.Valid
|
100.00
|
100.00
|
100.00
|
Visualization
# Age
age_plt <- num_attributes %>%
ggplot(aes(x = age)) +
geom_histogram(
aes(y = after_stat(density)),
binwidth = 1,
color = "gray",
fill = "lightblue",
alpha = 0.5
) +
geom_density(color = "blue") +
labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") +
theme_minimal()
# health_annual_paid
paid_plt <- num_attributes %>%
ggplot(aes(x = health_annual_paid)) +
geom_histogram(
aes(y = after_stat(density)),
binwidth = 10000,
color = "gray",
fill = "lightblue",
alpha = 0.5
) +
geom_density(color = "blue") +
labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") +
theme_minimal()
# days_associated
days_plt <- num_attributes %>%
ggplot(aes(x = days_associated)) +
geom_histogram(
aes(y = after_stat(density)),
color = "gray",
fill = "lightblue",
alpha = 0.5
) +
geom_density(color = "blue") +
labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated Distribution") +
theme_minimal()
gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)

num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>% select(-id, -one_of(num_names))
gender_plt <- cat_attributes %>%
ggplot(aes(x = gender)) +
geom_bar(aes(fill = gender), show.legend = F) +
labs(x = "Gender", y = "#", title = "Customers Gender") +
theme_minimal()
driving_license_plt <- cat_attributes %>%
ggplot(aes(x = driving_license)) +
geom_bar(aes(fill = driving_license), show.legend = F) +
labs(x = "Driving License", y = "#", title = "Customers \nDriving License") +
theme_minimal()
region_code_plt <- cat_attributes %>%
ggplot(aes(x = region_code)) +
geom_bar(aes(fill = factor(region_code)), show.legend = F) +
labs(x = "Region Code", y = "#", title = "Customers \nRegion Code") +
theme_minimal()
previously_insured_plt <- cat_attributes %>%
ggplot(aes(x = previously_insured)) +
geom_bar(aes(fill = previously_insured), show.legend = F) +
labs(x = "Previously Insured", y = "#", title = "Customers \nPreviously Insured") +
theme_minimal()
vehicle_age_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_age)) +
geom_bar(aes(fill = vehicle_age), show.legend = F) +
labs(x = "vehicle_age", y = "#", title = "Customers \nVehicle Age") +
theme_minimal()
vehicle_damage_plt <- cat_attributes %>%
ggplot(aes(x = vehicle_damage)) +
geom_bar(aes(fill = vehicle_damage), show.legend = F) +
labs(x = "vehicle_damage", y = "#", title = "Customers \nVehicle Damage") +
theme_minimal()
policy_sales_channel_plt <- cat_attributes %>%
ggplot(aes(x = policy_sales_channel)) +
geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = F) +
labs(x = "policy_sales_channel", y = "#", title = "Customers \nPolicy Sales Channel") +
theme_minimal()
response_plt <- cat_attributes %>%
ggplot(aes(x = response)) +
geom_bar(aes(fill = response), show.legend = F) +
labs(x = "response", y = "#", title = "Customers Response") +
theme_minimal()
gridExtra::grid.arrange(
gender_plt, driving_license_plt, region_code_plt, previously_insured_plt,
vehicle_age_plt, vehicle_damage_plt, policy_sales_channel_plt, response_plt,
ncol = 2, nrow=4
)
