1 Imports

library(tidyverse)
library(janitor)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)

2 Data Collection

df <- read_csv("01.data/train.csv")
glimpse(df)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1 <- janitor::clean_names(df) %>% 
  
  rename(
    days_associated = vintage,
    health_annual_paid = annual_premium
  ) %>% 
  
  mutate(
    across(where(is.character), tolower),
    driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == "< 1 year" ~ "below_1_year",
      vehicle_age == "1-2 year" ~ "between_1_2_years",
      vehicle_age == "> 2 years" ~ "over_2_years"
    )
  ) %>% 
  
  mutate_if(is.character, as.factor) %>% 

  # changing yes/no levels
  mutate(
    response = factor(response, levels = c("yes", "no")),
    driving_license = factor(driving_license, levels = c("yes", "no")),
    previously_insured = factor(previously_insured, levels = c("yes", "no")),
    vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
  )

glimpse(df1)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")

3.1 Data Types

variable_calsses <- tibble(variables = names(df1),
       types = unlist(lapply(df1, class))
)

variable_calsses

4 Column Description

variables <- df1 %>% names()

description <- c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer does not have DL (yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance (yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past (yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance (yes/no)"
)

df_description <- tibble(variables = variables, description = description)

kable(df_description, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)
variables description
id Unique ID for the customer
gender Gender of the customer
age Age of the customer
driving_license Customer does not have DL (yes/no)
region_code Unique code for the region of the customer
previously_insured Customer already has Vehicle Insurance (yes/no)
vehicle_age Age of the Vehicle
vehicle_damage Customer got his/her vehicle damaged in the past (yes/no)
health_annual_paid The amount customer needs to pay as premium in the year
policy_sales_channel Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc
days_associated Number of Days, Customer has been associated with the company
response Customer is interested in car insurance (yes/no)

5 Descriptive Statistics

# read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)
## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…
  • Check Data Structure so far:
skimr::skim(df_cleaned)
Data summary
Name df_cleaned
Number of rows 381109
Number of columns 12
_______________________
Column type frequency:
factor 6
numeric 6
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
gender 0 1 FALSE 2 mal: 206089, fem: 175020
driving_license 0 1 FALSE 2 yes: 380297, no: 812
previously_insured 0 1 FALSE 2 no: 206481, yes: 174628
vehicle_age 0 1 FALSE 3 bet: 200316, bel: 164786, ove: 16007
vehicle_damage 0 1 FALSE 2 yes: 192413, no: 188696
response 0 1 FALSE 2 no: 334399, yes: 46710

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 190555.00 110016.84 1 95278 190555 285832 381109 ▇▇▇▇▇
age 0 1 38.82 15.51 20 25 36 49 85 ▇▃▃▂▁
region_code 0 1 26.39 13.23 0 15 28 35 52 ▃▂▇▃▃
health_annual_paid 0 1 30564.39 17213.16 2630 24405 31669 39400 540165 ▇▁▁▁▁
policy_sales_channel 0 1 112.03 54.20 1 29 133 152 163 ▅▁▁▃▇
days_associated 0 1 154.35 83.67 10 82 154 227 299 ▇▇▇▇▇

5.1 General Overview

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ "categorical",
                driving_license ~ "categorical",
                previously_insured ~ "categorical",
                vehicle_damage ~ "categorical"),
    digits = list(all_categorical() ~ c(0, 2))
  )
Characteristic N = 381,1091
gender
    female 175,020 (45.92%)
    male 206,089 (54.08%)
age 36 (25, 49)
driving_license
    yes 380,297 (99.79%)
    no 812 (0.21%)
region_code 28 (15, 35)
previously_insured
    yes 174,628 (45.82%)
    no 206,481 (54.18%)
vehicle_age
    below_1_year 164,786 (43.24%)
    between_1_2_years 200,316 (52.56%)
    over_2_years 16,007 (4.20%)
vehicle_damage
    yes 192,413 (50.49%)
    no 188,696 (49.51%)
health_annual_paid 31,669 (24,405, 39,400)
policy_sales_channel 133 (29, 152)
days_associated 154 (82, 227)
response
    yes 46,710 (12.26%)
    no 334,399 (87.74%)
1 n (%); Median (Q1, Q3)

5.2 More Detailed Statistics

num_attributes <- df_cleaned %>% 
  select(age, health_annual_paid, days_associated)
descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)
## Error : Can't find summarytools
kable(data.frame(descriptive_tab), format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)
age days_associated health_annual_paid
Mean 38.82 154.35 30564.39
Std.Dev 15.51 83.67 17213.16
Min 20.00 10.00 2630.00
Q1 25.00 82.00 24405.00
Median 36.00 154.00 31669.00
Q3 49.00 227.00 39400.00
Max 85.00 299.00 540165.00
MAD 17.79 108.23 11125.43
IQR 24.00 145.00 14995.00
CV 0.40 0.54 0.56
Skewness 0.67 0.00 1.77
SE.Skewness 0.00 0.00 0.00
Kurtosis -0.57 -1.20 34.00
N.Valid 381109.00 381109.00 381109.00
Pct.Valid 100.00 100.00 100.00

5.3 Visualization

  • Numerical Attributes
# Age
age_plt <- num_attributes %>% 
  ggplot(aes(x = age)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1,
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") + 
  theme_minimal()


# health_annual_paid
paid_plt <- num_attributes %>% 
  ggplot(aes(x = health_annual_paid)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 10000,
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") + 
  theme_minimal()


# days_associated
days_plt <- num_attributes %>% 
  ggplot(aes(x = days_associated)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated Distribution") + 
  theme_minimal()

gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)

  • Categorical Attributes
num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>% select(-id, -one_of(num_names))
gender_plt <- cat_attributes %>% 
  ggplot(aes(x = gender)) + 
  geom_bar(aes(fill = gender), show.legend = F) + 
  labs(x = "Gender", y = "#", title = "Customers Gender") + 
  theme_minimal()

driving_license_plt <- cat_attributes %>% 
  ggplot(aes(x = driving_license)) + 
  geom_bar(aes(fill = driving_license), show.legend = F) + 
  labs(x = "Driving License", y = "#", title = "Customers \nDriving License") + 
  theme_minimal()

region_code_plt <- cat_attributes %>% 
  ggplot(aes(x = region_code)) + 
  geom_bar(aes(fill = factor(region_code)), show.legend = F) + 
  labs(x = "Region Code", y = "#", title = "Customers \nRegion Code") + 
  theme_minimal()

previously_insured_plt <- cat_attributes %>% 
  ggplot(aes(x = previously_insured)) + 
  geom_bar(aes(fill = previously_insured), show.legend = F) + 
  labs(x = "Previously Insured", y = "#", title = "Customers \nPreviously Insured") + 
  theme_minimal()

vehicle_age_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_age)) + 
  geom_bar(aes(fill = vehicle_age), show.legend = F) + 
  labs(x = "vehicle_age", y = "#", title = "Customers \nVehicle Age") + 
  theme_minimal()

vehicle_damage_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_damage)) + 
  geom_bar(aes(fill = vehicle_damage), show.legend = F) + 
  labs(x = "vehicle_damage", y = "#", title = "Customers \nVehicle Damage") + 
  theme_minimal()

policy_sales_channel_plt <- cat_attributes %>% 
  ggplot(aes(x = policy_sales_channel)) + 
  geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = F) + 
  labs(x = "policy_sales_channel", y = "#", title = "Customers \nPolicy Sales Channel") + 
  theme_minimal()

response_plt <- cat_attributes %>% 
  ggplot(aes(x = response)) + 
  geom_bar(aes(fill = response), show.legend = F) + 
  labs(x = "response", y = "#", title = "Customers Response") + 
  theme_minimal()

gridExtra::grid.arrange(
  gender_plt, driving_license_plt, region_code_plt, previously_insured_plt,
  vehicle_age_plt, vehicle_damage_plt, policy_sales_channel_plt, response_plt,
  ncol = 2, nrow=4
)