1 Imports

library(tidyverse)
library(janitor)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)

2 Data Collection

df <- read_csv("01.data/train.csv")
glimpse(df)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ Gender               <chr> "Male", "Male", "Male", "Male", "Female", "Female…
## $ Age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ Driving_License      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Region_Code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ Previously_Insured   <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ Vehicle_Age          <chr> "> 2 Years", "1-2 Year", "> 2 Years", "< 1 Year",…
## $ Vehicle_Damage       <chr> "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "Ye…
## $ Annual_Premium       <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ Policy_Sales_Channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ Vintage              <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ Response             <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0…

3 Data Cleaning

df1 <- janitor::clean_names(df) %>% 
  
  rename(
    days_associated = vintage,
    health_annual_paid = annual_premium
  ) %>% 
  
  mutate(
    across(where(is.character), tolower),
    driving_license = ifelse(driving_license == 1, "yes", "no"),
    previously_insured = ifelse(previously_insured == 1, "yes", "no"),
    response = ifelse(response == 1, "yes", "no"),
    vehicle_age = case_when(
      vehicle_age == "< 1 year" ~ "below_1_year",
      vehicle_age == "1-2 year" ~ "between_1_2_years",
      vehicle_age == "> 2 years" ~ "over_2_years"
    )
  ) %>% 
  
  mutate_if(is.character, as.factor) %>% 

  # changing yes/no levels
  mutate(
    response = factor(response, levels = c("yes", "no")),
    driving_license = factor(driving_license, levels = c("yes", "no")),
    previously_insured = factor(previously_insured, levels = c("yes", "no")),
    vehicle_damage = factor(vehicle_damage, levels = c("yes", "no"))
  )

glimpse(df1)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…

# save df_cleaned as RDS
saveRDS(df1, "df_cleaned.rds")

3.1 Data Types

variable_calsses <- tibble(variables = names(df1),
       types = unlist(lapply(df1, class))
)

variable_calsses

4 Column Description

variables <- df1 %>% names()

description <- c(
  "Unique ID for the customer",
  "Gender of the customer",
  "Age of the customer",
  "Customer does not have DL (yes/no)",
  "Unique code for the region of the customer",
  "Customer already has Vehicle Insurance (yes/no)",
  "Age of the Vehicle",
  "Customer got his/her vehicle damaged in the past (yes/no)",
  "The amount customer needs to pay as premium in the year",
  "Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc",
  "Number of Days, Customer has been associated with the company",
  "Customer is interested in car insurance (yes/no)"
)

df_description <- tibble(variables = variables, description = description)

kable(df_description, format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)

variables	description
id	Unique ID for the customer
gender	Gender of the customer
age	Age of the customer
driving_license	Customer does not have DL (yes/no)
region_code	Unique code for the region of the customer
previously_insured	Customer already has Vehicle Insurance (yes/no)
vehicle_age	Age of the Vehicle
vehicle_damage	Customer got his/her vehicle damaged in the past (yes/no)
health_annual_paid	The amount customer needs to pay as premium in the year
policy_sales_channel	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc
days_associated	Number of Days, Customer has been associated with the company
response	Customer is interested in car insurance (yes/no)

5 Descriptive Statistics

# read cleaned data
df_cleaned <- readRDS("df_cleaned.rds")
glimpse(df_cleaned)

## Rows: 381,109
## Columns: 12
## $ id                   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ gender               <fct> male, male, male, male, female, female, male, fem…
## $ age                  <dbl> 44, 76, 47, 21, 29, 24, 23, 56, 24, 32, 47, 24, 4…
## $ driving_license      <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ region_code          <dbl> 28, 3, 28, 11, 41, 33, 11, 28, 3, 6, 35, 50, 15, …
## $ previously_insured   <fct> no, no, no, yes, yes, no, no, no, yes, yes, no, y…
## $ vehicle_age          <fct> over_2_years, between_1_2_years, over_2_years, be…
## $ vehicle_damage       <fct> yes, no, yes, no, no, yes, yes, yes, no, no, yes,…
## $ health_annual_paid   <dbl> 40454, 33536, 38294, 28619, 27496, 2630, 23367, 3…
## $ policy_sales_channel <dbl> 26, 26, 26, 152, 152, 160, 152, 26, 152, 152, 124…
## $ days_associated      <dbl> 217, 183, 27, 203, 39, 176, 249, 72, 28, 80, 46, …
## $ response             <fct> yes, no, yes, no, no, no, no, yes, no, no, yes, n…

Check Data Structure so far:

skimr::skim(df_cleaned)

Data summary
Name	df_cleaned
Number of rows	381109
Number of columns	12
_______________________
Column type frequency:
factor	6
numeric	6
________________________
Group variables	None

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
gender	1	FALSE	2	mal: 206089, fem: 175020
driving_license	1	FALSE	2	yes: 380297, no: 812
previously_insured	1	FALSE	2	no: 206481, yes: 174628
vehicle_age	1	FALSE	3	bet: 200316, bel: 164786, ove: 16007
vehicle_damage	1	FALSE	2	yes: 192413, no: 188696
response	1	FALSE	2	no: 334399, yes: 46710

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	1	190555.00	110016.84	1	95278	190555	285832	381109	▇▇▇▇▇
age	1	38.82	15.51	20	25	36	49	85	▇▃▃▂▁
region_code	1	26.39	13.23	0	15	28	35	52	▃▂▇▃▃
health_annual_paid	1	30564.39	17213.16	2630	24405	31669	39400	540165	▇▁▁▁▁
policy_sales_channel	1	112.03	54.20	1	29	133	152	163	▅▁▁▃▇
days_associated	1	154.35	83.67	10	82	154	227	299	▇▇▇▇▇

5.1 General Overview

df_cleaned %>% 
  select(-id) %>% 
  tbl_summary(
    type = list(response ~ "categorical",
                driving_license ~ "categorical",
                previously_insured ~ "categorical",
                vehicle_damage ~ "categorical"),
    digits = list(all_categorical() ~ c(0, 2))
  )

Characteristic	N = 381,109¹
gender
female	175,020 (45.92%)
male	206,089 (54.08%)
age	36 (25, 49)
driving_license
yes	380,297 (99.79%)
no	812 (0.21%)
region_code	28 (15, 35)
previously_insured
yes	174,628 (45.82%)
no	206,481 (54.18%)
vehicle_age
below_1_year	164,786 (43.24%)
between_1_2_years	200,316 (52.56%)
over_2_years	16,007 (4.20%)
vehicle_damage
yes	192,413 (50.49%)
no	188,696 (49.51%)
health_annual_paid	31,669 (24,405, 39,400)
policy_sales_channel	133 (29, 152)
days_associated	154 (82, 227)
response
yes	46,710 (12.26%)
no	334,399 (87.74%)
¹ n (%); Median (Q1, Q3)

5.2 More Detailed Statistics

num_attributes <- df_cleaned %>% 
  select(age, health_annual_paid, days_associated)

descriptive_tab <- summarytools::descr(num_attributes, style = "rmarkdown") %>% round(2)

## Error : Can't find summarytools

kable(data.frame(descriptive_tab), format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped", full_width = F)

	age	days_associated	health_annual_paid
Mean	38.82	154.35	30564.39
Std.Dev	15.51	83.67	17213.16
Min	20.00	10.00	2630.00
Q1	25.00	82.00	24405.00
Median	36.00	154.00	31669.00
Q3	49.00	227.00	39400.00
Max	85.00	299.00	540165.00
MAD	17.79	108.23	11125.43
IQR	24.00	145.00	14995.00
CV	0.40	0.54	0.56
Skewness	0.67	0.00	1.77
SE.Skewness	0.00	0.00	0.00
Kurtosis	-0.57	-1.20	34.00
N.Valid	381109.00	381109.00	381109.00
Pct.Valid	100.00	100.00	100.00

5.3 Visualization

Numerical Attributes

# Age
age_plt <- num_attributes %>% 
  ggplot(aes(x = age)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 1,
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Age", y = "Density", title = "Customers \nAge Distribution") + 
  theme_minimal()


# health_annual_paid
paid_plt <- num_attributes %>% 
  ggplot(aes(x = health_annual_paid)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 10000,
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Health Annual Paid", y = "Density", title = "Customers \nPayments Distribution") + 
  theme_minimal()


# days_associated
days_plt <- num_attributes %>% 
  ggplot(aes(x = days_associated)) + 
  geom_histogram(
    aes(y = after_stat(density)),
    color = "gray",
    fill = "lightblue",
    alpha = 0.5
  ) + 
  geom_density(color = "blue") + 
  labs(x = "Days Associated", y = "Density", title = "Customers Days \nAssociated Distribution") + 
  theme_minimal()

gridExtra::grid.arrange(age_plt, paid_plt, days_plt, ncol = 3)

Categorical Attributes

num_names <- names(num_attributes)
cat_attributes <- df_cleaned %>% select(-id, -one_of(num_names))

gender_plt <- cat_attributes %>% 
  ggplot(aes(x = gender)) + 
  geom_bar(aes(fill = gender), show.legend = F) + 
  labs(x = "Gender", y = "#", title = "Customers Gender") + 
  theme_minimal()

driving_license_plt <- cat_attributes %>% 
  ggplot(aes(x = driving_license)) + 
  geom_bar(aes(fill = driving_license), show.legend = F) + 
  labs(x = "Driving License", y = "#", title = "Customers \nDriving License") + 
  theme_minimal()

region_code_plt <- cat_attributes %>% 
  ggplot(aes(x = region_code)) + 
  geom_bar(aes(fill = factor(region_code)), show.legend = F) + 
  labs(x = "Region Code", y = "#", title = "Customers \nRegion Code") + 
  theme_minimal()

previously_insured_plt <- cat_attributes %>% 
  ggplot(aes(x = previously_insured)) + 
  geom_bar(aes(fill = previously_insured), show.legend = F) + 
  labs(x = "Previously Insured", y = "#", title = "Customers \nPreviously Insured") + 
  theme_minimal()

vehicle_age_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_age)) + 
  geom_bar(aes(fill = vehicle_age), show.legend = F) + 
  labs(x = "vehicle_age", y = "#", title = "Customers \nVehicle Age") + 
  theme_minimal()

vehicle_damage_plt <- cat_attributes %>% 
  ggplot(aes(x = vehicle_damage)) + 
  geom_bar(aes(fill = vehicle_damage), show.legend = F) + 
  labs(x = "vehicle_damage", y = "#", title = "Customers \nVehicle Damage") + 
  theme_minimal()

policy_sales_channel_plt <- cat_attributes %>% 
  ggplot(aes(x = policy_sales_channel)) + 
  geom_bar(aes(fill = factor(policy_sales_channel)), show.legend = F) + 
  labs(x = "policy_sales_channel", y = "#", title = "Customers \nPolicy Sales Channel") + 
  theme_minimal()

response_plt <- cat_attributes %>% 
  ggplot(aes(x = response)) + 
  geom_bar(aes(fill = response), show.legend = F) + 
  labs(x = "response", y = "#", title = "Customers Response") + 
  theme_minimal()

gridExtra::grid.arrange(
  gender_plt, driving_license_plt, region_code_plt, previously_insured_plt,
  vehicle_age_plt, vehicle_damage_plt, policy_sales_channel_plt, response_plt,
  ncol = 2, nrow=4
)

Data Science With R

Felipe Vasconcelos

07/10/2024

1 Imports

2 Data Collection

3 Data Cleaning

3.1 Data Types

4 Column Description

5 Descriptive Statistics

5.1 General Overview

5.2 More Detailed Statistics

5.3 Visualization