DS

Data description tells us several things: * the business situation: contracts are typically 6 months - 1 year type, no surprise in the auto insurance industry.

customer segment (many young customers, then middle aged, then really old aged customers)

revenue (young clients many and offered lower premium, middle aged are less in number but provide the main income source for the company)

customer service/customer loyalty (they said no after they are reimbursed or they didnt wish to renew their contracts after 1 year).

overall, those who have vehicles who have been damaged before said yes, otherwise they likely said no, but we had some features to target anyway.

the picture of our possible targeted customer: middle aged (35-50), contract has duration <1 year, new vehicle, channel 152, region 28.

#Initial steps

library(readr)
train_13 <- read_delim("train_13.csv", delim = ";", escape_double = FALSE,
                                       col_types = cols(id = col_character(),
                                                  Gender = col_factor(),
                                                  Driving_License = col_factor(), 
                                                  Region_Code = col_factor(),
                                                  Previously_Insured = col_factor(),
                                                  Vehicle_Age = col_factor(),
                                                  Policy_Sales_Channel = col_factor(),
                                                  Vehicle_Damage = col_factor(),
                                                  Response = col_factor()),
                                       na = "NA", trim_ws = TRUE)
attach(train_13)
str(train_13)

## spc_tbl_ [50,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : chr [1:50000] "1" "2" "3" "4" ...
##  $ Gender              : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 2 2 2 1 2 ...
##  $ Age                 : num [1:50000] 24 48 71 74 43 44 49 31 21 60 ...
##  $ Driving_License     : Factor w/ 2 levels "1","0": 1 1 1 1 1 2 1 1 1 1 ...
##  $ Region_Code         : Factor w/ 53 levels "40","16","35",..: 1 2 3 4 5 6 5 7 8 8 ...
##  $ Previously_Insured  : Factor w/ 2 levels "1","0": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Vehicle_Age         : Factor w/ 3 levels "< 1 Year","1-2 Year",..: 1 2 2 1 2 2 2 1 1 3 ...
##  $ Vehicle_Damage      : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Annual_Premium      : num [1:50000] 21795 28274 40297 37214 2575100 ...
##  $ Policy_Sales_Channel: Factor w/ 129 levels "152","26","163",..: 1 2 3 1 4 3 2 1 5 4 ...
##  $ Vintage             : num [1:50000] 292 115 113 156 200 118 247 33 155 122 ...
##  $ Response            : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 1 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Age = col_double(),
##   ..   Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vintage = col_double(),
##   ..   Response = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
##   .. )
##  - attr(*, "problems")=<externalptr>

GENDER <- train_13$Gender
AGE <- train_13$Age
LICENSE <- train_13$Driving_License
REGION <- train_13$Region_Code
PREV_INS <- train_13$Previously_Insured
V_AGE <- train_13$Vehicle_Age
V_DAMAGE <- train_13$Vehicle_Damage
PREMIUM <- train_13$Annual_Premium
CHANNEL <- train_13$Policy_Sales_Channel
VINTAGE <- train_13$Vintage
RES <- train_13$Response

dat <- data.frame(
  GENDER,
  AGE,
  LICENSE,
  REGION,
  PREV_INS,
  V_AGE,
  V_DAMAGE,
  PREMIUM,
  CHANNEL,
  VINTAGE,
  RES
)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

head(dat)

##   GENDER AGE LICENSE REGION PREV_INS    V_AGE V_DAMAGE PREMIUM CHANNEL VINTAGE
## 1 Female  24       1     40        1 < 1 Year       No   21795     152     292
## 2   Male  48       1     16        0 1-2 Year      Yes   28274      26     115
## 3 Female  71       1     35        0 1-2 Year      Yes   40297     163     113
## 4 Female  74       1      8        0 < 1 Year      Yes   37214     152     156
## 5   Male  43       1     28        0 1-2 Year      Yes 2575100     124     200
## 6   Male  44       0     31        0 1-2 Year      Yes    2630     163     118
##   RES
## 1   0
## 2   0
## 3   1
## 4   0
## 5   0
## 6   0

library(readr)
test_13 <- read_delim("test.csv", delim = ",", escape_double = FALSE,
                                       col_types = cols(id = col_character(),
                                                  Gender = col_factor(),
                                                  Driving_License = col_factor(), 
                                                  Region_Code = col_factor(),
                                                  Previously_Insured = col_factor(),
                                                  Vehicle_Age = col_factor(),
                                                  Policy_Sales_Channel = col_factor(),
                                                  Vehicle_Damage = col_factor(),
                                                  Response = col_factor()),
                                       na = "NA", trim_ws = TRUE)

## Warning: The following named parsers don't match the column names: Response

str(test_13)

## spc_tbl_ [30,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                  : chr [1:30000] "1" "2" "3" "4" ...
##  $ Gender              : Factor w/ 2 levels "Male","Female": 1 2 1 2 2 1 2 2 1 2 ...
##  $ Age                 : num [1:30000] 44 60 30 26 29 48 25 22 27 32 ...
##  $ Driving_License     : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Region_Code         : Factor w/ 53 levels "28","33","30",..: 1 2 3 3 4 5 6 7 7 8 ...
##  $ Previously_Insured  : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 2 1 ...
##  $ Vehicle_Age         : Factor w/ 3 levels "> 2 Years","1-2 Year",..: 1 2 3 3 3 2 3 3 3 2 ...
##  $ Vehicle_Damage      : Factor w/ 2 levels "Yes","No": 1 1 1 2 2 2 2 2 2 1 ...
##  $ Annual_Premium      : num [1:30000] 40454 32363 24550 31136 32923 ...
##  $ Policy_Sales_Channel: Factor w/ 116 levels "26","124","152",..: 1 2 2 3 3 2 3 3 4 1 ...
##  $ Vintage             : num [1:30000] 217 102 45 186 34 246 62 156 77 166 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_character(),
##   ..   Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Age = col_double(),
##   ..   Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Annual_Premium = col_double(),
##   ..   Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   Vintage = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

colSums(is.na(test_13))

##                   id               Gender                  Age 
##                    0                    0                    0 
##      Driving_License          Region_Code   Previously_Insured 
##                    0                    0                    0 
##          Vehicle_Age       Vehicle_Damage       Annual_Premium 
##                    0                    0                    0 
## Policy_Sales_Channel              Vintage 
##                    0                    0

colSums(is.na(train_13))

##                   id               Gender                  Age 
##                    0                    0                    0 
##      Driving_License          Region_Code   Previously_Insured 
##                    0                    0                    0 
##          Vehicle_Age       Vehicle_Damage       Annual_Premium 
##                    0                    0                    0 
## Policy_Sales_Channel              Vintage             Response 
##                    0                    0                    0

colSums(is.na(dat))

##   GENDER      AGE  LICENSE   REGION PREV_INS    V_AGE V_DAMAGE  PREMIUM 
##        0        0        0        0        0        0        0        0 
##  CHANNEL  VINTAGE      RES 
##        0        0        0

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(reprex)
library(purrr)
library(dplyr)
summary(dat)

##     GENDER           AGE         LICENSE       REGION      PREV_INS 
##  Female:21934   Min.   : 20.00   1:41016   28     :14847   1:15275  
##  Male  :28066   1st Qu.: 27.00   0: 8984   8      : 3822   0:34725  
##                 Median : 40.00             46     : 2631            
##                 Mean   : 42.19             41     : 2488            
##                 3rd Qu.: 51.00             15     : 1599            
##                 Max.   :137.00             30     : 1503            
##                                            (Other):23110            
##        V_AGE       V_DAMAGE       PREMIUM           CHANNEL         VINTAGE    
##  < 1 Year :17229   No :16873   Min.   :   2630   152    :13440   Min.   :  10  
##  1-2 Year :30035   Yes:33127   1st Qu.:  23939   26     :12057   1st Qu.:  85  
##  > 2 Years: 2736               Median :  31462   124    :11518   Median : 161  
##                                Mean   : 145985   160    : 2096   Mean   : 223  
##                                3rd Qu.:  38665   156    : 1932   3rd Qu.: 236  
##                                Max.   :4997200   122    : 1402   Max.   :5960  
##                                                  (Other): 7555                 
##  RES      
##  0:37773  
##  1:12227  
##           
##           
##           
##           
##

dat %>% 
  group_by(GENDER) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   GENDER percentage
##   <fct>       <dbl>
## 1 Male         56.1
## 2 Female       43.9

dat %>% 
  group_by(LICENSE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   LICENSE percentage
##   <fct>        <dbl>
## 1 1             82.0
## 2 0             18.0

dat %>% 
  group_by(REGION) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 53 × 2
##    REGION percentage
##    <fct>       <dbl>
##  1 28          29.7 
##  2 8            7.64
##  3 46           5.26
##  4 41           4.98
##  5 15           3.2 
##  6 30           3.01
##  7 29           2.97
##  8 3            2.77
##  9 50           2.35
## 10 11           2.34
## # ℹ 43 more rows

dat %>% 
  group_by(PREV_INS) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   PREV_INS percentage
##   <fct>         <dbl>
## 1 0              69.4
## 2 1              30.6

dat %>% 
  group_by(V_AGE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 3 × 2
##   V_AGE     percentage
##   <fct>          <dbl>
## 1 1-2 Year       60.1 
## 2 < 1 Year       34.5 
## 3 > 2 Years       5.47

dat %>% 
  group_by(V_DAMAGE) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 2 × 2
##   V_DAMAGE percentage
##   <fct>         <dbl>
## 1 Yes            66.2
## 2 No             33.8

dat %>% 
  group_by(CHANNEL) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 129 × 2
##    CHANNEL percentage
##    <fct>        <dbl>
##  1 152          26.9 
##  2 26           24.1 
##  3 124          23.0 
##  4 160           4.19
##  5 156           3.86
##  6 122           2.8 
##  7 157           2.69
##  8 154           2.28
##  9 163           1.23
## 10 151           0.77
## # ℹ 119 more rows

#Data description We try to show some relationship between 1. Response (RES) and Numerical variables (AGE, PREMIUM, VINTAGE) We’ll use a plot combining boxplot, violin plot, and marginal density plots. 2. Response (RES) and Factor variables (the rest). Just groups of bar graphs or contigency table.

##Response (RES) and Numerical variables (AGE,PREMIUM,VINTAGE) From the results of the Overview chunk: - AGE: We noticed the main age groups: 20-30, 30-60, 60,80, 80+. Consequently, a new feature of age groups will be conducted. Further descriptions based on age of the insured will also be presented below. - PREMIUM: The violin plot suggest 2 customer segments of the company - those who have Annual Premium <10,000 (normally ~2,500) and >10,000 (about 30,000 is most common). Overall, the company gain the average annual premium of approx. 30,000 per client. - VINTAGE: The box plots indicate that on average, the respondants have been with the company for 6 months. Clients usually have contract duration of 1 year. The patterns of outliers illustrate a few number of customer renew their contract to even over 16 years. A new VINTAGE variable will be set up with factors <6 months, 6-12 months,…

library(ggplot2)
library(tidyr)
library(patchwork)

long_data <- dat %>%
    select(RES, AGE, PREMIUM, VINTAGE) %>%  
    pivot_longer(cols = c(AGE, PREMIUM, VINTAGE), 
                 names_to = "Variable", 
                 values_to = "Value")

y_limits <- list(
    AGE = c(min(AGE), max(AGE)),      
    PREMIUM = c(min(PREMIUM), 50000), 
    VINTAGE = c(min(VINTAGE), 365*2)    
)

plots <- lapply(names(y_limits), function(var) {
    ggplot(long_data %>% filter(Variable == var), aes(x = RES, y = Value)) +
        geom_violin(aes(colour=RES)) +
        geom_boxplot(width = 0.08, colour = "grey", outlier.size = 0.01, outlier.color = "green") +
        labs(title = paste("Violin and Box Plots of", var, "by Response"),
             x = "Response", y = "Value") +
        theme(legend.position = "bottom") +
        ylim(y_limits[[var]]) +
        theme_bw()   })

overview_plot <- wrap_plots(plots) +
    plot_layout(ncol = 1, heights = c(2, 2, 2), guides = "collect") 
    plot_annotation(title = "Violin and Box Plots by Variable", 
                    theme = theme(plot.title = element_text(size = 8, hjust = 0.5)))

## $title
## [1] "Violin and Box Plots by Variable"
## 
## $subtitle
## list()
## attr(,"class")
## [1] "waiver"
## 
## $caption
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_levels
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_prefix
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_suffix
## list()
## attr(,"class")
## [1] "waiver"
## 
## $tag_sep
## list()
## attr(,"class")
## [1] "waiver"
## 
## $theme
## List of 1
##  $ plot.title:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : num 8
##   ..$ hjust        : num 0.5
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE
## 
## attr(,"class")
## [1] "plot_annotation"

print(overview_plot)

## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Factor Engineering

breaks <- seq(min(AGE), max(AGE) + 5, by = 5)
labels <- paste(breaks[-length(breaks)], breaks[-1] - 1, sep = "-")
labels[length(labels)] <- paste(breaks[length(breaks) - 1], "+", sep = "")
dat$AGE_GRPS <- cut(AGE, breaks = breaks,include.lowest = TRUE, right = FALSE, labels = labels)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 14):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

breaks1 <- c(0,10000,20000,30000,40000,50000,max(PREMIUM))
labels1 <- paste(breaks1[-length(breaks1)], breaks1[-1], sep = "-")
labels1[length(labels1)] <- paste(breaks1[length(breaks1) - 1], "+", sep = "")
dat$PREMIUM_GRPS <- cut(PREMIUM, breaks = breaks1,include.lowest = TRUE, right = FALSE, labels = labels1)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 3):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
##     RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 15):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

breaks2 <- c(min(VINTAGE),182,365,1095, max(VINTAGE))
labels2 <- c("< 6 months","6 - 12 months", "12 - 36 months", "> 36 months")
dat$VINTAGE_GRPS <- cut(VINTAGE, breaks = breaks2,include.lowest = TRUE, right = FALSE, labels = labels2)
attach(dat)

## The following objects are masked _by_ .GlobalEnv:
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 3):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREMIUM_GRPS,
##     PREV_INS, REGION, RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 4):
## 
##     AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
##     RES, V_AGE, V_DAMAGE, VINTAGE

## The following objects are masked from dat (pos = 16):
## 
##     AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
##     V_AGE, V_DAMAGE, VINTAGE

colSums(is.na(dat))

##       GENDER          AGE      LICENSE       REGION     PREV_INS        V_AGE 
##            0            0            0            0            0            0 
##     V_DAMAGE      PREMIUM      CHANNEL      VINTAGE          RES     AGE_GRPS 
##            0            0            0            0            0            0 
## PREMIUM_GRPS VINTAGE_GRPS 
##            0            0

dat %>% 
  group_by(AGE_GRPS) %>% 
  summarise(percentage = round(n() / 500, 2)) %>%
  arrange(desc(percentage))

## # A tibble: 24 × 2
##    AGE_GRPS percentage
##    <fct>         <dbl>
##  1 20-24         18.2 
##  2 40-44         12.9 
##  3 25-29         12.6 
##  4 45-49         11.4 
##  5 35-39          8.91
##  6 50-54          8.36
##  7 30-34          8.05
##  8 55-59          5.22
##  9 60-64          3.73
## 10 65-69          2.97
## # ℹ 14 more rows

###Response by Age Violin plot #1: Response by Age, no Driving License. No license -> they’ll say No 100%, so just 1 violin plot here. We also observe that mostly are 40 - 45 years old.

library(dplyr)
library(ggExtra)
response.violin0 <- dat %>% 
  filter(LICENSE == "0") %>%
  ggplot(aes(x = RES, y = AGE)) +
  geom_violin(aes(colour=RES)) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(20, max(AGE), by = 5)) +
  theme_bw() +
  theme(legend.position = "none") 
print(response.violin0)

Violin plot #2: Response by Age, with Driving License. Most of those said No is young (20-25). Jittered points and the expansion of the violin. Most of those said Yes is middle aged (40-45). However they also contribute to the middle part expansion of the “No” violin plot, indicating a skeptical attitude, which resulted in the boxplot indicating the average age of saying No is around 35.

The density plots on the right hand side show the distribution of age by Respose (Yes/No). They are heavily left skewed, representing outliers (green), which are people with age from 85-90 and above who provided a response.

response.violin1<-dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = AGE,colour=RES)) +
  geom_violin(aes(colour=RES)) + 
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") + 
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 10)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(title="Response")) 
ggMarginal(response.violin1, type = "density", alpha = 0.3, groupFill = TRUE)

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

print(response.violin1)

We’ll give a try of getting rid of the outliers (remove observations for ppl aged 84-137) and have another plot. Why 84? Because it’s reasonable in common sense, the outliers are distributed similarly in the 2 plots (their response might not bring ), and max(age) of the test data is 83. New plot below.

response.violin2<-dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = AGE,colour=RES)) +
  geom_violin(aes(colour=RES)) + 
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") + 
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Age (years)") +
  scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 5)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  ylim(min(AGE),max(test_13$Age)) +
  guides(colour = guide_legend(title="Response"))

## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

ggMarginal(response.violin2, type = "density", alpha = 0.3, groupFill = TRUE)

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1410 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

print(response.violin2)

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1395 rows containing missing values or values outside the scale range
## (`geom_point()`).

###Response by Annual Premium The plots indicate 2 customer segments of the company: the <10000 of annual premium and the higher premium group (15000-20000 per year). The similar violin shapes suggest that this variable does not largely affect the response of the customer.

w.head<-dat %>% 
  filter(LICENSE == "0") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  xlab("") +
  ylab("") +
  theme_bw() +
  guides(colour = guide_legend(title="Response"))
w.head

response.violin3 <- dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(legend.position = "bottom") +
  ylim(min(PREMIUM),max(test_13$Annual_Premium)) + #Cutting the long head
  guides(colour = guide_legend(title="Response"))
response.violin3

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 5486 rows containing missing values or values outside the scale range
## (`geom_point()`).

print(response.violin3)

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 5393 rows containing missing values or values outside the scale range
## (`geom_point()`).

response.violin4_0 <- dat %>% 
  filter(LICENSE == "1") %>%
  ggplot(aes(x = RES, y = PREMIUM)) +
  geom_violin(aes(color=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  geom_smooth()+
  xlab("") +
  ylab("Annual Premium") +
  theme_bw()+
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(title="Response"))
response.violin4_0

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

response.violin4<-response.violin4_0+ylim(min(PREMIUM),60000)
ggMarginal(response.violin4, type = "density", alpha = 0.9, groupFill = TRUE)

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 5473 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

print(response.violin4)

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 5425 rows containing missing values or values outside the scale range
## (`geom_point()`).

###Response by Number of days in the contract Turns out it’s not a very important variable.

dat %>% 
  filter(LICENSE=="0") %>%
  ggplot(aes(x = RES, y = VINTAGE)) +
  geom_violin() +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Number of days since the insured
started a contract") +
  theme_bw()+
  theme(legend.position = "bottom")

response.violin5_0 <- dat %>% 
  filter(LICENSE=="1") %>%
  ggplot(aes(x = RES, y = VINTAGE)) +
  geom_violin(aes(colour=RES)) +
  geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
  geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
  xlab("") +
  ylab("Number of days since the insured
started a contract") +
  theme_bw()
response.violin5_0

response.violin5 <- response.violin5_0 + ylim(min(VINTAGE),500) +
  theme_bw()+
  theme(legend.position = "bottom") 
response.violin5

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1522 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggMarginal(response.violin5, type = "density", alpha = 0.9, groupFill = TRUE)

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1522 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

print(response.violin5)

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 1515 rows containing missing values or values outside the scale range
## (`geom_point()`).

##Response by Categorical Variables

plot1 <-dat %>% 
  group_by(CHANNEL,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  slice_head(n = 8) %>%
  ggplot(aes(x = CHANNEL, y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot1

plot2 <-dat %>% 
  group_by(REGION,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  slice_head(n = 8) %>%
  ggplot(aes(x = reorder(REGION, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot2

plot3 <-dat %>% 
  group_by(AGE_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(AGE_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot3

plot4 <-dat %>% 
  group_by(VINTAGE_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(VINTAGE_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot4

plot5 <-dat %>% 
  group_by(V_AGE,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(V_AGE, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot5

plot6 <-dat %>% 
  group_by(PREV_INS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(PREV_INS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "Response by Previously Insured Situation",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot6

plot7 <-dat %>% 
  group_by(PREMIUM_GRPS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(PREMIUM_GRPS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot7

plot8 <-dat %>% 
  group_by(V_DAMAGE,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(V_DAMAGE, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot8

plot9 <-dat %>% 
  group_by(PREV_INS,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(PREV_INS, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot9

plot10 <-dat %>% 
  group_by(LICENSE,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(LICENSE, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot10

plot11 <-dat %>% 
  group_by(GENDER,RES) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count)) %>%
  ggplot(aes(x = reorder(GENDER, -count), y = count, fill = RES)) +
  geom_bar(stat = "identity") +
  labs(title = "",
       x = "",
       y = "Count",
       fill = "Response") +
  theme_minimal() +
  scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot11

dat %>% 
  group_by(VINTAGE_GRPS,PREMIUM_GRPS,AGE_GRPS) %>% 
  summarise(count=n(), .groups = 'drop') %>% 
  arrange(desc(count))

## # A tibble: 355 × 4
##    VINTAGE_GRPS  PREMIUM_GRPS AGE_GRPS count
##    <fct>         <fct>        <fct>    <int>
##  1 < 6 months    30000-40000  20-24     1781
##  2 < 6 months    20000-30000  20-24     1677
##  3 6 - 12 months 30000-40000  20-24     1294
##  4 < 6 months    30000-40000  45-49     1254
##  5 < 6 months    30000-40000  40-44     1226
##  6 < 6 months    30000-40000  25-29     1222
##  7 6 - 12 months 20000-30000  20-24     1203
##  8 < 6 months    20000-30000  25-29     1117
##  9 6 - 12 months 30000-40000  40-44      937
## 10 < 6 months    20000-30000  40-44      854
## # ℹ 345 more rows

#ML in Python

Sys.setenv(RETICULATE_PYTHON = “/usr/local/bin/python3”) RETICULATE_PYTHON=“/usr/bin/python” library(reticulate)

if (reticulate::py_available()) message(“Python 3 found.”) if (reticulate::py_module_available(“pandas”)) message(“‘pandas’ found.”) if (reticulate::py_module_available(“matplotlib”)) message(“‘matplotlib’ found.”) if (reticulate::py_module_available(“numpy”)) message(“‘numpy’ found.”) if (reticulate::py_module_available(“xgboost”)) message(“‘xgb’ found.”)

{python run if the libraries are not available} #!pip install matplotlib #!pip install pandas #!pip install numpy #!pip3 install xgboost

import numpy as np import pandas as pd import matplotlib.pyplot as plt import math import xgboost as xgb

df=r.train_13 df.info df_test=r.test_13 df_test

df_onehot = pd.get_dummies(df, dtype=‘int’) df_onehot

column_to_move = df_onehot.pop(“Response”) # insert column with insert(location, column_name, column_value) df_onehot.insert(len(df_onehot.columns), “Response”, column_to_move) df_onehot

DS

2024-11-25

Factor Engineering