DS

This file is the preprocessing procedure for the data. - cleaning: yes. checked for NA and outliers - feature engineering: yes. created 3 new factor variables

and then data description tells us several things: - the business situation: contracts are typically 6 months - 1 year type, no surprise in the autoinsurance industry.

customer segment (many young customers, then middle aged, then really old aged customers)

revenue (young clients many and offered lower premium, middle aged are less in number but provide the main income source for the company)

customer service/customer loyalty (they said no after they are reimbursed or they didnt wish to renew their contracts after 1 year).

overall, those who have vehicles who have been damaged before said yes, otherwise they likely said no, but we had some features to target anyway.

the picture of our possible targeted customer: middle aged (35-50), contract has duration <1 year, new vehicle, channel 152, region 28.

#Initial steps

library(readr)
train_13 <- read_delim("train_13.csv", delim = ";", escape_double = FALSE,
                                       col_types = cols(id = col_character(),
                                                  Gender = col_factor(),
                                                  Driving_License = col_factor(), 
                                                  Region_Code = col_factor(),
                                                  Previously_Insured = col_factor(),
                                                  Vehicle_Age = col_factor(),
                                                  Policy_Sales_Channel = col_factor(),
                                                  Vehicle_Damage = col_factor(),
                                                  Response = col_factor()),
                                       na = "NA", trim_ws = TRUE)
attach(train_13)
str(train_13)

## spc_tbl_ [50,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : chr [1:50000] "1" "2" "3" "4" ...
##  $ Gender              : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 2 2 2 1 2 ...
##  $ Age                 : num [1:50000] 24 48 71 74 43 44 49 31 21 60 ...
##  $ Driving_License     : Factor w/ 2 levels "1","0": 1 1 1 1 1 2 1 1 1 1 ...
##  $ Region_Code         : Factor w/ 53 levels "40","16","35",..: 1 2 3 4 5 6 5 7 8 8 ...
##  $ Previously_Insured  : Factor w/ 2 levels "1","0": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Vehicle_Age         : Factor w/ 3 levels "< 1 Year","1-2 Year",..: 1 2 2 1 2 2 2 1 1 3 ...
##  $ Vehicle_Damage      : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Annual_Premium      : num [1:50000] 21795 28274 40297 37214 2575100 ...
##  $ Policy_Sales_Channel: Factor w/ 129 levels "152","26","163",..: 1 2 3 1 4 3 2 1 5 4 ...
##  $ Vintage             : num [1:50000] 292 115 113 156 200 118 247 33 155 122 ...
##  $ Response            : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 1 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Age = col_double(),
##   ..   Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vintage = col_double(),
##   ..   Response = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
##   .. )
##  - attr(*, "problems")=<externalptr>

GENDER <- train_13$Gender
AGE <- train_13$Age
LICENSE <- train_13$Driving_License
REGION <- train_13$Region_Code
PREV_INS <- train_13$Previously_Insured
V_AGE <- train_13$Vehicle_Age
V_DAMAGE <- train_13$Vehicle_Damage
PREMIUM <- train_13$Annual_Premium
CHANNEL <- train_13$Policy_Sales_Channel
VINTAGE <- train_13$Vintage
RES <- train_13$Response

dat <- data.frame(
  GENDER,
  AGE,
  LICENSE,
  REGION,
  PREV_INS,
  V_AGE,
  V_DAMAGE,
  PREMIUM,
  CHANNEL,
  VINTAGE,
  RES
)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

head(dat)

##   GENDER AGE LICENSE REGION PREV_INS    V_AGE V_DAMAGE PREMIUM CHANNEL VINTAGE
## 1 Female  24       1     40        1 < 1 Year       No   21795     152     292
## 2   Male  48       1     16        0 1-2 Year      Yes   28274      26     115
## 3 Female  71       1     35        0 1-2 Year      Yes   40297     163     113
## 4 Female  74       1      8        0 < 1 Year      Yes   37214     152     156
## 5   Male  43       1     28        0 1-2 Year      Yes 2575100     124     200
## 6   Male  44       0     31        0 1-2 Year      Yes    2630     163     118
##   RES
## 1   0
## 2   0
## 3   1
## 4   0
## 5   0
## 6   0

library(readr)
test_13 <- read_delim("test.csv", delim = ",", escape_double = FALSE,
                                       col_types = cols(id = col_character(),
                                                  Gender = col_factor(),
                                                  Driving_License = col_factor(), 
                                                  Region_Code = col_factor(),
                                                  Previously_Insured = col_factor(),
                                                  Vehicle_Age = col_factor(),
                                                  Policy_Sales_Channel = col_factor(),
                                                  Vehicle_Damage = col_factor(),
                                                  Response = col_factor()),
                                       na = "NA", trim_ws = TRUE)

## Warning: The following named parsers don't match the column names: Response

str(test_13)

## spc_tbl_ [30,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : chr [1:30000] "1" "2" "3" "4" ...
##  $ Gender              : Factor w/ 2 levels "Male","Female": 1 2 1 2 2 1 2 2 1 2 ...
##  $ Age                 : num [1:30000] 44 60 30 26 29 48 25 22 27 32 ...
##  $ Driving_License     : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Region_Code         : Factor w/ 53 levels "28","33","30",..: 1 2 3 3 4 5 6 7 7 8 ...
##  $ Previously_Insured  : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 2 1 ...
##  $ Vehicle_Age         : Factor w/ 3 levels "> 2 Years","1-2 Year",..: 1 2 3 3 3 2 3 3 3 2 ...
##  $ Vehicle_Damage      : Factor w/ 2 levels "Yes","No": 1 1 1 2 2 2 2 2 2 1 ...
##  $ Annual_Premium      : num [1:30000] 40454 32363 24550 31136 32923 ...
##  $ Policy_Sales_Channel: Factor w/ 116 levels "26","124","152",..: 1 2 2 3 3 2 3 3 4 1 ...
##  $ Vintage             : num [1:30000] 217 102 45 186 34 246 62 156 77 166 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Age = col_double(),
##   ..   Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vintage = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

colSums(is.na(test_13))

##                   id               Gender                  Age 
##                    0                    0                    0 
##      Driving_License          Region_Code   Previously_Insured 
##                    0                    0                    0 
##          Vehicle_Age       Vehicle_Damage       Annual_Premium 
##                    0                    0                    0 
## Policy_Sales_Channel              Vintage 
##                    0                    0

colSums(is.na(train_13))

##                   id               Gender                  Age 
##                    0                    0                    0 
##      Driving_License          Region_Code   Previously_Insured 
##                    0                    0                    0 
##          Vehicle_Age       Vehicle_Damage       Annual_Premium 
##                    0                    0                    0 
## Policy_Sales_Channel              Vintage             Response 
##                    0                    0                    0

colSums(is.na(dat))

##   GENDER      AGE  LICENSE   REGION PREV_INS    V_AGE V_DAMAGE  PREMIUM 
##        0        0        0        0        0        0        0        0 
##  CHANNEL  VINTAGE      RES 
##        0        0        0

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(reprex)
library(purrr)
library(dplyr)
summary(dat)

##     GENDER           AGE         LICENSE       REGION      PREV_INS 
##  Female:21934   Min.   : 20.00   1:41016   28     :14847   1:15275  
##  Male  :28066   1st Qu.: 27.00   0: 8984   8      : 3822   0:34725  
##                 Median : 40.00             46     : 2631            
##                 Mean   : 42.19             41     : 2488            
##                 3rd Qu.: 51.00             15     : 1599            
##                 Max.   :137.00             30     : 1503            
##                                            (Other):23110            
##        V_AGE       V_DAMAGE       PREMIUM           CHANNEL         VINTAGE    
##  < 1 Year :17229   No :16873   Min.   :   2630   152    :13440   Min.   :  10  
##  1-2 Year :30035   Yes:33127   1st Qu.:  23939   26     :12057   1st Qu.:  85  
##  > 2 Years: 2736               Median :  31462   124    :11518   Median : 161  
##                                Mean   : 145985   160    : 2096   Mean   : 223  
##                                3rd Qu.:  38665   156    : 1932   3rd Qu.: 236  
##                                Max.   :4997200   122    : 1402   Max.   :5960  
##                                                  (Other): 7555                 
##  RES      
##  0:37773  
##  1:12227  
##           
##           
##           
##           
##

dat %>% 
  group_by(GENDER) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   GENDER percentage
##   <fct>       <dbl>
## 1 Male         56.1
## 2 Female       43.9

dat %>% 
  group_by(LICENSE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   LICENSE percentage
##   <fct>        <dbl>
## 1 1             82.0
## 2 0             18.0

dat %>% 
  group_by(REGION) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 53 × 2
##    REGION percentage
##    <fct>       <dbl>
##  1 28          29.7 
##  2 8            7.64
##  3 46           5.26
##  4 41           4.98
##  5 15           3.2 
##  6 30           3.01
##  7 29           2.97
##  8 3            2.77
##  9 50           2.35
## 10 11           2.34
## # ℹ 43 more rows

dat %>% 
  group_by(PREV_INS) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   PREV_INS percentage
##   <fct>         <dbl>
## 1 0              69.4
## 2 1              30.6

dat %>% 
  group_by(V_AGE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 3 × 2
##   V_AGE     percentage
##   <fct>          <dbl>
## 1 1-2 Year       60.1 
## 2 < 1 Year       34.5 
## 3 > 2 Years       5.47

dat %>% 
  group_by(V_DAMAGE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   V_DAMAGE percentage
##   <fct>         <dbl>
## 1 Yes            66.2
## 2 No             33.8

dat %>% 
  group_by(CHANNEL) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 129 × 2
##    CHANNEL percentage
##    <fct>        <dbl>
##  1 152          26.9 
##  2 26           24.1 
##  3 124          23.0 
##  4 160           4.19
##  5 156           3.86
##  6 122           2.8 
##  7 157           2.69
##  8 154           2.28
##  9 163           1.23
## 10 151           0.77
## # ℹ 119 more rows

#Data description We try to show some relationship between 1. Response (RES) and Numerical variables (AGE, PREMIUM, VINTAGE) We’ll use a plot combining boxplot, violin plot, and marginal density plots. 2. Response (RES) and Factor variables (the rest). Just groups of bar graphs or contigency table.

##Response (RES) and Numerical variables (AGE,PREMIUM,VINTAGE) From the results of the Overview chunk: - AGE: We noticed the main age groups: 20-30, 30-60, 60,80, 80+. Consequently, a new feature of age groups will be conducted. Further descriptions based on age of the insured will also be presented below. - PREMIUM: The violin plot suggest 2 customer segments of the company - those who have Annual Premium <10,000 (normally ~2,500) and >10,000 (about 30,000 is most common). Overall, the company gain the average annual premium of approx. 30,000 per client. - VINTAGE: The box plots indicate that on average, the respondants have been with the company for 6 months. Clients usually have contract duration of 1 year. The patterns of outliers illustrate a few number of customer renew their contract to even over 16 years. A new VINTAGE variable will be set up with factors <6 months, 6-12 months,…

library(ggplot2)
library(tidyr)
library(patchwork)

long_data <- dat %>%
    select(RES, AGE, PREMIUM, VINTAGE) %>%  
    pivot_longer(cols = c(AGE, PREMIUM, VINTAGE), 
                 names_to = "Variable", 
                 values_to = "Value")

y_limits <- list(
    AGE = c(min(AGE), max(AGE)),      
    PREMIUM = c(min(PREMIUM), 50000), 
    VINTAGE = c(min(VINTAGE), 365*2)    
)

plots <- lapply(names(y_limits), function(var) {
    ggplot(long_data %>% filter(Variable == var), aes(x = RES, y = Value)) +
        geom_violin(aes(colour=RES)) +
        geom_boxplot(width = 0.08, colour = "grey", outlier.size = 0.01, outlier.color = "green") +
        labs(title = paste("Violin and Box Plots of", var, "by Response"),
             x = "Response", y = "Value") +
        theme(legend.position = "bottom") +
        ylim(y_limits[[var]]) +
        theme_bw()   })

overview_plot <- wrap_plots(plots) +
    plot_layout(ncol = 1, heights = c(2, 2, 2), guides = "collect") 
    plot_annotation(title = "Violin and Box Plots by Variable", 
                    theme = theme(plot.title = element_text(size = 8, hjust = 0.5)))

## $title
## [1] "Violin and Box Plots by Variable"
## 
## $subtitle
## list()
## attr(,"class")
## [1] "waiver"
## 
## $caption
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_levels
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_prefix
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_suffix
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_sep
## list()
## attr(,"class")
## [1] "waiver"
## 
## $theme
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 8
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
## 
## attr(,"class")
## [1] "plot_annotation"

print(overview_plot)

## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Factor Engineering

breaks <- seq(min(AGE), max(AGE) + 5, by = 5)
labels <- paste(breaks[-length(breaks)], breaks[-1] - 1, sep = "-")
labels[length(labels)] <- paste(breaks[length(breaks) - 1], "+", sep = "")
dat$AGE_GRPS <- cut(AGE, breaks = breaks,include.lowest = TRUE, right = FALSE, labels = labels)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 14):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

breaks1 <- c(0,10000,20000,30000,40000,50000,max(PREMIUM))
labels1 <- paste(breaks1[-length(breaks1)], breaks1[-1], sep = "-")
labels1[length(labels1)] <- paste(breaks1[length(breaks1) - 1], "+", sep = "")
dat$PREMIUM_GRPS <- cut(PREMIUM, breaks = breaks1,include.lowest = TRUE, right = FALSE, labels = labels1)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 3):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
##     RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 15):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

breaks2 <- c(min(VINTAGE),182,365,1095, max(VINTAGE))
labels2 <- c("< 6 months","6 - 12 months", "12 - 36 months", "> 36 months")
dat$VINTAGE_GRPS <- cut(VINTAGE, breaks = breaks2,include.lowest = TRUE, right = FALSE, labels = labels2)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 3):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREMIUM_GRPS,
##     PREV_INS, REGION, RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 4):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
##     RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 16):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

colSums(is.na(dat))

##       GENDER          AGE      LICENSE       REGION     PREV_INS        V_AGE 
##            0            0            0            0            0            0 
##     V_DAMAGE      PREMIUM      CHANNEL      VINTAGE          RES     AGE_GRPS 
##            0            0            0            0            0            0 
## PREMIUM_GRPS VINTAGE_GRPS 
##            0            0

dat %>% 
  group_by(AGE_GRPS) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 24 × 2
##    AGE_GRPS percentage
##    <fct>         <dbl>
##  1 20-24         18.2 
##  2 40-44         12.9 
##  3 25-29         12.6 
##  4 45-49         11.4 
##  5 35-39          8.91
##  6 50-54          8.36
##  7 30-34          8.05
##  8 55-59          5.22
##  9 60-64          3.73
## 10 65-69          2.97
## # ℹ 14 more rows

###Response by Age Violin plot #1: Response by Age, no Driving License. No license -> they’ll say No 100%, so just 1 violin plot here. We also observe that mostly are 40 - 45 years old.

library(dplyr)
library(ggExtra)
response.violin0 <- dat %>% 
  filter(LICENSE == "0") %>%
  ggplot(aes(x = RES, y = AGE)) +
  geom_violin(aes(colour=RES)) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(20, 180, by = 5)) +
  theme_bw()
response.violin0

Violin plot #2: Response by Age, with Driving License. Most of those said No is young (20-25). Jittered points and the expansion of the violin. Most of those said Yes is middle aged (40-45). However they also contribute to the middle part expansion of the “No” violin plot, indicating a skeptical attitude, which resulted in the boxplot indicating the average age of saying No is around 35.

The density plots on the right hand side show the distribution of age by Respose (Yes/No). They are heavily left skewed, representing outliers (green), which are people with age from 85-90 and above who provided a response.

response.violin1<-dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = AGE,colour=RES)) +
  geom_violin(aes(colour=RES)) + 
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") + 
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 10)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(title="Response")) 
ggMarginal(response.violin1, type = "density", alpha = 0.3, groupFill = TRUE)

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

We’ll give a try of getting rid of the outliers (remove observations for ppl aged 84-137) and have another plot. Why 84? Because it’s reasonable in common sense, the outliers are distributed similarly in the 2 plots (their response might not bring ), and max(age) of the test data is 83. New plot below.

response.violin2<-dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = AGE,colour=RES)) +
  geom_violin(aes(colour=RES)) + 
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") + 
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 5)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  ylim(min(AGE),max(test_13$Age)) +
  guides(colour = guide_legend(title="Response"))

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

ggMarginal(response.violin2, type = "density", alpha = 0.3, groupFill = TRUE)

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1399 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

###Response by Annual Premium The plots indicate 2 customer segments of the company: the <10000 of annual premium and the higher premium group (15000-20000 per year). The similar violin shapes suggest that this variable does not largely affect the response of the customer.

w.head<-dat %>% 
  filter(LICENSE == "0") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  xlab("") +
  ylab("") +
  theme_bw() +
  guides(colour = guide_legend(title="Response"))
w.head

response.violin3 <- dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(legend.position = "bottom") +
  ylim(min(PREMIUM),max(test_13$Annual_Premium)) + #Cutting the long head
  guides(colour = guide_legend(title="Response"))
response.violin3

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 5468 rows containing missing values or values outside the scale range
## (`geom_point()`).

response.violin4_0 <- dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(color=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  geom_smooth()+
  xlab("") +
  ylab("Annual Premium") +
  theme_bw()+
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(title="Response"))
response.violin4_0

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

response.violin4<-response.violin4_0+ylim(min(PREMIUM),60000)
ggMarginal(response.violin4, type = "density", alpha = 0.9, groupFill = TRUE)

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 5439 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

###Response by Number of days in the contract Turns out it’s not a very important variable.

dat %>% 
  filter(LICENSE=="0") %>%
  ggplot(aes(x = RES, y = VINTAGE)) +
  geom_violin() +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Number of days since the insured
started a contract") +
  theme_bw()+
  theme(legend.position = "bottom")

response.violin5_0 <- dat %>% 
  filter(LICENSE=="1") %>%
  ggplot(aes(x = RES, y = VINTAGE)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Number of days since the insured
started a contract") +
  theme_bw()
response.violin5_0

response.violin5 <- response.violin5_0 + ylim(min(VINTAGE),500) +
  theme_bw()+
  theme(legend.position = "bottom") 
response.violin5

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1510 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggMarginal(response.violin5, type = "density", alpha = 0.9, groupFill = TRUE)

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1528 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

##Response by Categorical Variables

plot1 <-dat %>% 
  group_by(CHANNEL,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = CHANNEL, y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Policy Sales Channel",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot1

plot2 <-dat %>% 
  group_by(REGION,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(REGION, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Region Code",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot2

plot3 <-dat %>% 
  group_by(AGE_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(AGE_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Age Group",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot3

plot4 <-dat %>% 
  group_by(VINTAGE_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(VINTAGE_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Days of loyalty",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot4

plot5 <-dat %>% 
  group_by(V_AGE,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(V_AGE, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Vehicle Age",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot5

plot6 <-dat %>% 
  group_by(PREV_INS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(PREV_INS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Previously Insured Situation",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot6

plot7 <-dat %>% 
  group_by(PREMIUM_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(PREMIUM_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Premium brackets",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot7

plot8 <-dat %>% 
  group_by(V_DAMAGE,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(V_DAMAGE, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "The Vehicle has been damaged before or not, by Resonse",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot8

dat %>% 
  group_by(VINTAGE_GRPS,PREMIUM_GRPS,AGE_GRPS) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count))

## # A tibble: 355 × 4
##    VINTAGE_GRPS  PREMIUM_GRPS AGE_GRPS count
##    <fct>         <fct>        <fct>    <int>
##  1 < 6 months    30000-40000  20-24     1781
##  2 < 6 months    20000-30000  20-24     1677
##  3 6 - 12 months 30000-40000  20-24     1294
##  4 < 6 months    30000-40000  45-49     1254
##  5 < 6 months    30000-40000  40-44     1226
##  6 < 6 months    30000-40000  25-29     1222
##  7 6 - 12 months 20000-30000  20-24     1203
##  8 < 6 months    20000-30000  25-29     1117
##  9 6 - 12 months 30000-40000  40-44      937
## 10 < 6 months    20000-30000  40-44      854
## # ℹ 345 more rows

m=glm(RES ~ GENDER + Age + Driving_License + Region_Code + Policy_Sales_Channel + Previously_Insured + Vehicle_Age + Vehicle_Damage + Annual_Premium + Vintage, data = train_13, family = binomial()) summary(m) cook1= cooks.distance(m) large_cd= cook1>4/length(cook1) length(cook1[large_cd])

#ML in Python

if (reticulate::py_available()) message("Python 3 found.")
if (reticulate::py_module_available("pandas")) message("'pandas' found.")

## 'pandas' found.

if (reticulate::py_module_available("matplotlib")) message("'matplotlib' found.")

## 'matplotlib' found.

if (reticulate::py_module_available("numpy")) message("'numpy' found.")

## 'numpy' found.

if (reticulate::py_module_available("xgboost")) message("'xgb' found.")

## 'xgb' found.

#!pip install matplotlib
#!pip install pandas
#!pip install numpy
#!pip3 install xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import xgboost as xgb

df=r.train_13
df.info

## <bound method DataFrame.info of           id  Gender   Age  ... Policy_Sales_Channel Vintage Response
## 0          1  Female  24.0  ...                  152   292.0        0
## 1          2    Male  48.0  ...                   26   115.0        0
## 2          3  Female  71.0  ...                  163   113.0        1
## 3          4  Female  74.0  ...                  152   156.0        0
## 4          5    Male  43.0  ...                  124   200.0        0
## ...      ...     ...   ...  ...                  ...     ...      ...
## 49995  49996    Male  24.0  ...                   26   234.0        0
## 49996  49997    Male  50.0  ...                   26   208.0        0
## 49997  49998    Male  25.0  ...                  152   191.0        1
## 49998  49999    Male  35.0  ...                  156   202.0        0
## 49999  50000    Male  65.0  ...                   30   105.0        0
## 
## [50000 rows x 12 columns]>

df_test=r.test_13
df_test

##           id  Gender   Age  ... Annual_Premium Policy_Sales_Channel Vintage
## 0          1    Male  44.0  ...        40454.0                   26   217.0
## 1          2  Female  60.0  ...        32363.0                  124   102.0
## 2          3    Male  30.0  ...        24550.0                  124    45.0
## 3          4  Female  26.0  ...        31136.0                  152   186.0
## 4          5  Female  29.0  ...        32923.0                  152    34.0
## ...      ...     ...   ...  ...            ...                  ...     ...
## 29995  29996    Male  33.0  ...        49455.0                  122   198.0
## 29996  29997    Male  41.0  ...        42721.0                   54   132.0
## 29997  29998    Male  25.0  ...        35369.0                  152    43.0
## 29998  29999    Male  50.0  ...        43214.0                    7    48.0
## 29999  30000  Female  34.0  ...         2630.0                  124   208.0
## 
## [30000 rows x 11 columns]

df_onehot = pd.get_dummies(df,  dtype='int')
df_onehot

##         Age  Annual_Premium  ...  Response_0  Response_1
## 0      24.0         21795.0  ...           1           0
## 1      48.0         28274.0  ...           1           0
## 2      71.0         40297.0  ...           0           1
## 3      74.0         37214.0  ...           1           0
## 4      43.0       2575100.0  ...           1           0
## ...     ...             ...  ...         ...         ...
## 49995  24.0         37012.0  ...           1           0
## 49996  50.0         43065.0  ...           1           0
## 49997  25.0         36435.0  ...           0           1
## 49998  35.0         26490.0  ...           1           0
## 49999  65.0          2630.0  ...           1           0
## 
## [50000 rows x 50198 columns]

column_to_move = df_onehot.pop(“Response”) # insert column with insert(location, column_name, column_value) df_onehot.insert(len(df_onehot.columns), “Response”, column_to_move) df_onehot ```

DS

2024-11-25

Factor Engineering