This file is the preprocessing procedure for the data. - cleaning: yes. checked for NA and outliers - feature engineering: yes. created 3 new factor variables
and then data description tells us several things: - the business situation: contracts are typically 6 months - 1 year type, no surprise in the autoinsurance industry.
customer segment (many young customers, then middle aged, then really old aged customers)
revenue (young clients many and offered lower premium, middle aged are less in number but provide the main income source for the company)
customer service/customer loyalty (they said no after they are reimbursed or they didnt wish to renew their contracts after 1 year).
overall, those who have vehicles who have been damaged before said yes, otherwise they likely said no, but we had some features to target anyway.
#Initial steps
library(readr)
train_13 <- read_delim("train_13.csv", delim = ";", escape_double = FALSE,
col_types = cols(id = col_character(),
Gender = col_factor(),
Driving_License = col_factor(),
Region_Code = col_factor(),
Previously_Insured = col_factor(),
Vehicle_Age = col_factor(),
Policy_Sales_Channel = col_factor(),
Vehicle_Damage = col_factor(),
Response = col_factor()),
na = "NA", trim_ws = TRUE)
attach(train_13)
str(train_13)
## spc_tbl_ [50,000 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : chr [1:50000] "1" "2" "3" "4" ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 1 2 2 2 2 1 2 ...
## $ Age : num [1:50000] 24 48 71 74 43 44 49 31 21 60 ...
## $ Driving_License : Factor w/ 2 levels "1","0": 1 1 1 1 1 2 1 1 1 1 ...
## $ Region_Code : Factor w/ 53 levels "40","16","35",..: 1 2 3 4 5 6 5 7 8 8 ...
## $ Previously_Insured : Factor w/ 2 levels "1","0": 1 2 2 2 2 2 2 1 1 2 ...
## $ Vehicle_Age : Factor w/ 3 levels "< 1 Year","1-2 Year",..: 1 2 2 1 2 2 2 1 1 3 ...
## $ Vehicle_Damage : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 1 1 2 ...
## $ Annual_Premium : num [1:50000] 21795 28274 40297 37214 2575100 ...
## $ Policy_Sales_Channel: Factor w/ 129 levels "152","26","163",..: 1 2 3 1 4 3 2 1 5 4 ...
## $ Vintage : num [1:50000] 292 115 113 156 200 118 247 33 155 122 ...
## $ Response : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 1 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Age = col_double(),
## .. Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Annual_Premium = col_double(),
## .. Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vintage = col_double(),
## .. Response = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE)
## .. )
## - attr(*, "problems")=<externalptr>
GENDER <- train_13$Gender
AGE <- train_13$Age
LICENSE <- train_13$Driving_License
REGION <- train_13$Region_Code
PREV_INS <- train_13$Previously_Insured
V_AGE <- train_13$Vehicle_Age
V_DAMAGE <- train_13$Vehicle_Damage
PREMIUM <- train_13$Annual_Premium
CHANNEL <- train_13$Policy_Sales_Channel
VINTAGE <- train_13$Vintage
RES <- train_13$Response
dat <- data.frame(
GENDER,
AGE,
LICENSE,
REGION,
PREV_INS,
V_AGE,
V_DAMAGE,
PREMIUM,
CHANNEL,
VINTAGE,
RES
)
attach(dat)
## The following objects are masked _by_ .GlobalEnv:
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
head(dat)
## GENDER AGE LICENSE REGION PREV_INS V_AGE V_DAMAGE PREMIUM CHANNEL VINTAGE
## 1 Female 24 1 40 1 < 1 Year No 21795 152 292
## 2 Male 48 1 16 0 1-2 Year Yes 28274 26 115
## 3 Female 71 1 35 0 1-2 Year Yes 40297 163 113
## 4 Female 74 1 8 0 < 1 Year Yes 37214 152 156
## 5 Male 43 1 28 0 1-2 Year Yes 2575100 124 200
## 6 Male 44 0 31 0 1-2 Year Yes 2630 163 118
## RES
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 0
library(readr)
test_13 <- read_delim("test.csv", delim = ",", escape_double = FALSE,
col_types = cols(id = col_character(),
Gender = col_factor(),
Driving_License = col_factor(),
Region_Code = col_factor(),
Previously_Insured = col_factor(),
Vehicle_Age = col_factor(),
Policy_Sales_Channel = col_factor(),
Vehicle_Damage = col_factor(),
Response = col_factor()),
na = "NA", trim_ws = TRUE)
## Warning: The following named parsers don't match the column names: Response
str(test_13)
## spc_tbl_ [30,000 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : chr [1:30000] "1" "2" "3" "4" ...
## $ Gender : Factor w/ 2 levels "Male","Female": 1 2 1 2 2 1 2 2 1 2 ...
## $ Age : num [1:30000] 44 60 30 26 29 48 25 22 27 32 ...
## $ Driving_License : Factor w/ 2 levels "1","0": 1 1 1 1 1 1 1 1 1 1 ...
## $ Region_Code : Factor w/ 53 levels "28","33","30",..: 1 2 3 3 4 5 6 7 7 8 ...
## $ Previously_Insured : Factor w/ 2 levels "0","1": 1 1 1 2 2 1 2 2 2 1 ...
## $ Vehicle_Age : Factor w/ 3 levels "> 2 Years","1-2 Year",..: 1 2 3 3 3 2 3 3 3 2 ...
## $ Vehicle_Damage : Factor w/ 2 levels "Yes","No": 1 1 1 2 2 2 2 2 2 1 ...
## $ Annual_Premium : num [1:30000] 40454 32363 24550 31136 32923 ...
## $ Policy_Sales_Channel: Factor w/ 116 levels "26","124","152",..: 1 2 2 3 3 2 3 3 4 1 ...
## $ Vintage : num [1:30000] 217 102 45 186 34 246 62 156 77 166 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. Gender = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Age = col_double(),
## .. Driving_License = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Region_Code = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Previously_Insured = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vehicle_Age = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vehicle_Damage = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Annual_Premium = col_double(),
## .. Policy_Sales_Channel = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
## .. Vintage = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
colSums(is.na(test_13))
## id Gender Age
## 0 0 0
## Driving_License Region_Code Previously_Insured
## 0 0 0
## Vehicle_Age Vehicle_Damage Annual_Premium
## 0 0 0
## Policy_Sales_Channel Vintage
## 0 0
colSums(is.na(train_13))
## id Gender Age
## 0 0 0
## Driving_License Region_Code Previously_Insured
## 0 0 0
## Vehicle_Age Vehicle_Damage Annual_Premium
## 0 0 0
## Policy_Sales_Channel Vintage Response
## 0 0 0
colSums(is.na(dat))
## GENDER AGE LICENSE REGION PREV_INS V_AGE V_DAMAGE PREMIUM
## 0 0 0 0 0 0 0 0
## CHANNEL VINTAGE RES
## 0 0 0
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(reprex)
library(purrr)
library(dplyr)
summary(dat)
## GENDER AGE LICENSE REGION PREV_INS
## Female:21934 Min. : 20.00 1:41016 28 :14847 1:15275
## Male :28066 1st Qu.: 27.00 0: 8984 8 : 3822 0:34725
## Median : 40.00 46 : 2631
## Mean : 42.19 41 : 2488
## 3rd Qu.: 51.00 15 : 1599
## Max. :137.00 30 : 1503
## (Other):23110
## V_AGE V_DAMAGE PREMIUM CHANNEL VINTAGE
## < 1 Year :17229 No :16873 Min. : 2630 152 :13440 Min. : 10
## 1-2 Year :30035 Yes:33127 1st Qu.: 23939 26 :12057 1st Qu.: 85
## > 2 Years: 2736 Median : 31462 124 :11518 Median : 161
## Mean : 145985 160 : 2096 Mean : 223
## 3rd Qu.: 38665 156 : 1932 3rd Qu.: 236
## Max. :4997200 122 : 1402 Max. :5960
## (Other): 7555
## RES
## 0:37773
## 1:12227
##
##
##
##
##
dat %>%
group_by(GENDER) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 2 × 2
## GENDER percentage
## <fct> <dbl>
## 1 Male 56.1
## 2 Female 43.9
dat %>%
group_by(LICENSE) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 2 × 2
## LICENSE percentage
## <fct> <dbl>
## 1 1 82.0
## 2 0 18.0
dat %>%
group_by(REGION) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 53 × 2
## REGION percentage
## <fct> <dbl>
## 1 28 29.7
## 2 8 7.64
## 3 46 5.26
## 4 41 4.98
## 5 15 3.2
## 6 30 3.01
## 7 29 2.97
## 8 3 2.77
## 9 50 2.35
## 10 11 2.34
## # ℹ 43 more rows
dat %>%
group_by(PREV_INS) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 2 × 2
## PREV_INS percentage
## <fct> <dbl>
## 1 0 69.4
## 2 1 30.6
dat %>%
group_by(V_AGE) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 3 × 2
## V_AGE percentage
## <fct> <dbl>
## 1 1-2 Year 60.1
## 2 < 1 Year 34.5
## 3 > 2 Years 5.47
dat %>%
group_by(V_DAMAGE) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 2 × 2
## V_DAMAGE percentage
## <fct> <dbl>
## 1 Yes 66.2
## 2 No 33.8
dat %>%
group_by(CHANNEL) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 129 × 2
## CHANNEL percentage
## <fct> <dbl>
## 1 152 26.9
## 2 26 24.1
## 3 124 23.0
## 4 160 4.19
## 5 156 3.86
## 6 122 2.8
## 7 157 2.69
## 8 154 2.28
## 9 163 1.23
## 10 151 0.77
## # ℹ 119 more rows
#Data description We try to show some relationship between 1. Response (RES) and Numerical variables (AGE, PREMIUM, VINTAGE) We’ll use a plot combining boxplot, violin plot, and marginal density plots. 2. Response (RES) and Factor variables (the rest). Just groups of bar graphs or contigency table.
##Response (RES) and Numerical variables (AGE,PREMIUM,VINTAGE) From the results of the Overview chunk: - AGE: We noticed the main age groups: 20-30, 30-60, 60,80, 80+. Consequently, a new feature of age groups will be conducted. Further descriptions based on age of the insured will also be presented below. - PREMIUM: The violin plot suggest 2 customer segments of the company - those who have Annual Premium <10,000 (normally ~2,500) and >10,000 (about 30,000 is most common). Overall, the company gain the average annual premium of approx. 30,000 per client. - VINTAGE: The box plots indicate that on average, the respondants have been with the company for 6 months. Clients usually have contract duration of 1 year. The patterns of outliers illustrate a few number of customer renew their contract to even over 16 years. A new VINTAGE variable will be set up with factors <6 months, 6-12 months,…
library(ggplot2)
library(tidyr)
library(patchwork)
long_data <- dat %>%
select(RES, AGE, PREMIUM, VINTAGE) %>%
pivot_longer(cols = c(AGE, PREMIUM, VINTAGE),
names_to = "Variable",
values_to = "Value")
y_limits <- list(
AGE = c(min(AGE), max(AGE)),
PREMIUM = c(min(PREMIUM), 50000),
VINTAGE = c(min(VINTAGE), 365*2)
)
plots <- lapply(names(y_limits), function(var) {
ggplot(long_data %>% filter(Variable == var), aes(x = RES, y = Value)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.08, colour = "grey", outlier.size = 0.01, outlier.color = "green") +
labs(title = paste("Violin and Box Plots of", var, "by Response"),
x = "Response", y = "Value") +
theme(legend.position = "bottom") +
ylim(y_limits[[var]]) +
theme_bw() })
overview_plot <- wrap_plots(plots) +
plot_layout(ncol = 1, heights = c(2, 2, 2), guides = "collect")
plot_annotation(title = "Violin and Box Plots by Variable",
theme = theme(plot.title = element_text(size = 8, hjust = 0.5)))
## $title
## [1] "Violin and Box Plots by Variable"
##
## $subtitle
## list()
## attr(,"class")
## [1] "waiver"
##
## $caption
## list()
## attr(,"class")
## [1] "waiver"
##
## $tag_levels
## list()
## attr(,"class")
## [1] "waiver"
##
## $tag_prefix
## list()
## attr(,"class")
## [1] "waiver"
##
## $tag_suffix
## list()
## attr(,"class")
## [1] "waiver"
##
## $tag_sep
## list()
## attr(,"class")
## [1] "waiver"
##
## $theme
## List of 1
## $ plot.title:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : num 8
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
##
## attr(,"class")
## [1] "plot_annotation"
print(overview_plot)
## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2166 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1603 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
breaks <- seq(min(AGE), max(AGE) + 5, by = 5)
labels <- paste(breaks[-length(breaks)], breaks[-1] - 1, sep = "-")
labels[length(labels)] <- paste(breaks[length(breaks) - 1], "+", sep = "")
dat$AGE_GRPS <- cut(AGE, breaks = breaks,include.lowest = TRUE, right = FALSE, labels = labels)
attach(dat)
## The following objects are masked _by_ .GlobalEnv:
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 14):
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
breaks1 <- c(0,10000,20000,30000,40000,50000,max(PREMIUM))
labels1 <- paste(breaks1[-length(breaks1)], breaks1[-1], sep = "-")
labels1[length(labels1)] <- paste(breaks1[length(breaks1) - 1], "+", sep = "")
dat$PREMIUM_GRPS <- cut(PREMIUM, breaks = breaks1,include.lowest = TRUE, right = FALSE, labels = labels1)
attach(dat)
## The following objects are masked _by_ .GlobalEnv:
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 3):
##
## AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
## RES, V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 15):
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
breaks2 <- c(min(VINTAGE),182,365,1095, max(VINTAGE))
labels2 <- c("< 6 months","6 - 12 months", "12 - 36 months", "> 36 months")
dat$VINTAGE_GRPS <- cut(VINTAGE, breaks = breaks2,include.lowest = TRUE, right = FALSE, labels = labels2)
attach(dat)
## The following objects are masked _by_ .GlobalEnv:
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 3):
##
## AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREMIUM_GRPS,
## PREV_INS, REGION, RES, V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 4):
##
## AGE, AGE_GRPS, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION,
## RES, V_AGE, V_DAMAGE, VINTAGE
## The following objects are masked from dat (pos = 16):
##
## AGE, CHANNEL, GENDER, LICENSE, PREMIUM, PREV_INS, REGION, RES,
## V_AGE, V_DAMAGE, VINTAGE
colSums(is.na(dat))
## GENDER AGE LICENSE REGION PREV_INS V_AGE
## 0 0 0 0 0 0
## V_DAMAGE PREMIUM CHANNEL VINTAGE RES AGE_GRPS
## 0 0 0 0 0 0
## PREMIUM_GRPS VINTAGE_GRPS
## 0 0
dat %>%
group_by(AGE_GRPS) %>%
summarise(percentage = round(n() / 500, 2)) %>%
arrange(desc(percentage))
## # A tibble: 24 × 2
## AGE_GRPS percentage
## <fct> <dbl>
## 1 20-24 18.2
## 2 40-44 12.9
## 3 25-29 12.6
## 4 45-49 11.4
## 5 35-39 8.91
## 6 50-54 8.36
## 7 30-34 8.05
## 8 55-59 5.22
## 9 60-64 3.73
## 10 65-69 2.97
## # ℹ 14 more rows
###Response by Age Violin plot #1: Response by Age, no Driving License. No license -> they’ll say No 100%, so just 1 violin plot here. We also observe that mostly are 40 - 45 years old.
library(dplyr)
library(ggExtra)
response.violin0 <- dat %>%
filter(LICENSE == "0") %>%
ggplot(aes(x = RES, y = AGE)) +
geom_violin(aes(colour=RES)) +
xlab("") +
ylab("Age (years)") +
scale_y_continuous(breaks = seq(20, 180, by = 5)) +
theme_bw()
response.violin0

Violin plot #2: Response by Age, with Driving License. Most of those said No is young (20-25). Jittered points and the expansion of the violin. Most of those said Yes is middle aged (40-45). However they also contribute to the middle part expansion of the “No” violin plot, indicating a skeptical attitude, which resulted in the boxplot indicating the average age of saying No is around 35.
The density plots on the right hand side show the distribution of age by Respose (Yes/No). They are heavily left skewed, representing outliers (green), which are people with age from 85-90 and above who provided a response.
response.violin1<-dat %>%
filter(LICENSE == "1") %>%
ggplot(aes(x = RES, y = AGE,colour=RES)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
xlab("") +
ylab("Age (years)") +
scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 10)) +
theme_bw() +
theme(legend.position = "bottom") +
guides(colour = guide_legend(title="Response"))
ggMarginal(response.violin1, type = "density", alpha = 0.3, groupFill = TRUE)
## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
We’ll give a try of getting rid of the outliers (remove observations for
ppl aged 84-137) and have another plot. Why 84? Because it’s reasonable
in common sense, the outliers are distributed similarly in the 2 plots
(their response might not bring ), and max(age) of the test data is 83.
New plot below.
response.violin2<-dat %>%
filter(LICENSE == "1") %>%
ggplot(aes(x = RES, y = AGE,colour=RES)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
xlab("") +
ylab("Age (years)") +
scale_y_continuous(breaks = seq(min(AGE), max(AGE), by = 5)) +
theme_bw() +
theme(legend.position = "bottom") +
ylim(min(AGE),max(test_13$Age)) +
guides(colour = guide_legend(title="Response"))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
ggMarginal(response.violin2, type = "density", alpha = 0.3, groupFill = TRUE)
## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1102 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1399 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

###Response by Annual Premium The plots indicate 2 customer segments of the company: the <10000 of annual premium and the higher premium group (15000-20000 per year). The similar violin shapes suggest that this variable does not largely affect the response of the customer.
w.head<-dat %>%
filter(LICENSE == "0") %>%
ggplot(aes(x = RES, y = PREMIUM)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
xlab("") +
ylab("") +
theme_bw() +
guides(colour = guide_legend(title="Response"))
w.head

response.violin3 <- dat %>%
filter(LICENSE == "1") %>%
ggplot(aes(x = RES, y = PREMIUM)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
xlab("") +
ylab("") +
theme_bw() +
theme(legend.position = "bottom") +
ylim(min(PREMIUM),max(test_13$Annual_Premium)) + #Cutting the long head
guides(colour = guide_legend(title="Response"))
response.violin3
## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1774 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 5468 rows containing missing values or values outside the scale range
## (`geom_point()`).

response.violin4_0 <- dat %>%
filter(LICENSE == "1") %>%
ggplot(aes(x = RES, y = PREMIUM)) +
geom_violin(aes(color=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
geom_smooth()+
xlab("") +
ylab("Annual Premium") +
theme_bw()+
theme(legend.position = "bottom") +
guides(colour = guide_legend(title="Response"))
response.violin4_0
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
response.violin4<-response.violin4_0+ylim(min(PREMIUM),60000)
ggMarginal(response.violin4, type = "density", alpha = 0.9, groupFill = TRUE)
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1773 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5439 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
###Response by Number of days in the contract Turns out it’s not a very
important variable.
dat %>%
filter(LICENSE=="0") %>%
ggplot(aes(x = RES, y = VINTAGE)) +
geom_violin() +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
xlab("") +
ylab("Number of days since the insured
started a contract") +
theme_bw()+
theme(legend.position = "bottom")

response.violin5_0 <- dat %>%
filter(LICENSE=="1") %>%
ggplot(aes(x = RES, y = VINTAGE)) +
geom_violin(aes(colour=RES)) +
geom_boxplot(width = 0.05,colour = "grey",outlier.size = 0.01,outlier.color="green") +
geom_jitter(aes(colour=RES),size = 0.01, shape = 1, width=0.1,alpha=1/50) +
xlab("") +
ylab("Number of days since the insured
started a contract") +
theme_bw()
response.violin5_0

response.violin5 <- response.violin5_0 + ylim(min(VINTAGE),500) +
theme_bw()+
theme(legend.position = "bottom")
response.violin5
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1510 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggMarginal(response.violin5, type = "density", alpha = 0.9, groupFill = TRUE)
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 1455 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1528 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Computation failed in `stat_density()`.
## Computation failed in `stat_density()`.
## Caused by error in `density.default()`:
## ! non-finite 'from'
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

##Response by Categorical Variables
plot1 <-dat %>%
group_by(CHANNEL,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
slice_head(n = 10) %>%
ggplot(aes(x = CHANNEL, y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Policy Sales Channel",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot1
plot2 <-dat %>%
group_by(REGION,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
slice_head(n = 10) %>%
ggplot(aes(x = reorder(REGION, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Region Code",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot2
plot3 <-dat %>%
group_by(AGE_GRPS,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(AGE_GRPS, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Age Group",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot3
plot4 <-dat %>%
group_by(VINTAGE_GRPS,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(VINTAGE_GRPS, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Days of loyalty",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot4
plot5 <-dat %>%
group_by(V_AGE,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(V_AGE, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Vehicle Age",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot5
plot6 <-dat %>%
group_by(PREV_INS,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(PREV_INS, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Previously Insured Situation",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot6
plot7 <-dat %>%
group_by(PREMIUM_GRPS,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(PREMIUM_GRPS, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "Response by Premium brackets",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot7
plot8 <-dat %>%
group_by(V_DAMAGE,RES) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(V_DAMAGE, -count), y = count, fill = RES)) +
geom_bar(stat = "identity") +
labs(title = "The Vehicle has been damaged before or not, by Resonse",
x = "",
y = "Count",
fill = "Response") +
theme_minimal() +
scale_fill_manual(values = c("0" = "lightblue", "1" = "salmon"))
plot8
dat %>%
group_by(VINTAGE_GRPS,PREMIUM_GRPS,AGE_GRPS) %>%
summarise(count=n(), .groups = 'drop') %>%
arrange(desc(count))
## # A tibble: 355 × 4
## VINTAGE_GRPS PREMIUM_GRPS AGE_GRPS count
## <fct> <fct> <fct> <int>
## 1 < 6 months 30000-40000 20-24 1781
## 2 < 6 months 20000-30000 20-24 1677
## 3 6 - 12 months 30000-40000 20-24 1294
## 4 < 6 months 30000-40000 45-49 1254
## 5 < 6 months 30000-40000 40-44 1226
## 6 < 6 months 30000-40000 25-29 1222
## 7 6 - 12 months 20000-30000 20-24 1203
## 8 < 6 months 20000-30000 25-29 1117
## 9 6 - 12 months 30000-40000 40-44 937
## 10 < 6 months 20000-30000 40-44 854
## # ℹ 345 more rows
m=glm(RES ~ GENDER + Age + Driving_License + Region_Code + Policy_Sales_Channel + Previously_Insured + Vehicle_Age + Vehicle_Damage + Annual_Premium + Vintage, data = train_13, family = binomial()) summary(m) cook1= cooks.distance(m) large_cd= cook1>4/length(cook1) length(cook1[large_cd])
#ML in Python
if (reticulate::py_available()) message("Python 3 found.")
if (reticulate::py_module_available("pandas")) message("'pandas' found.")
## 'pandas' found.
if (reticulate::py_module_available("matplotlib")) message("'matplotlib' found.")
## 'matplotlib' found.
if (reticulate::py_module_available("numpy")) message("'numpy' found.")
## 'numpy' found.
if (reticulate::py_module_available("xgboost")) message("'xgb' found.")
## 'xgb' found.
#!pip install matplotlib
#!pip install pandas
#!pip install numpy
#!pip3 install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import xgboost as xgb
df=r.train_13
df.info
## <bound method DataFrame.info of id Gender Age ... Policy_Sales_Channel Vintage Response
## 0 1 Female 24.0 ... 152 292.0 0
## 1 2 Male 48.0 ... 26 115.0 0
## 2 3 Female 71.0 ... 163 113.0 1
## 3 4 Female 74.0 ... 152 156.0 0
## 4 5 Male 43.0 ... 124 200.0 0
## ... ... ... ... ... ... ... ...
## 49995 49996 Male 24.0 ... 26 234.0 0
## 49996 49997 Male 50.0 ... 26 208.0 0
## 49997 49998 Male 25.0 ... 152 191.0 1
## 49998 49999 Male 35.0 ... 156 202.0 0
## 49999 50000 Male 65.0 ... 30 105.0 0
##
## [50000 rows x 12 columns]>
df_test=r.test_13
df_test
## id Gender Age ... Annual_Premium Policy_Sales_Channel Vintage
## 0 1 Male 44.0 ... 40454.0 26 217.0
## 1 2 Female 60.0 ... 32363.0 124 102.0
## 2 3 Male 30.0 ... 24550.0 124 45.0
## 3 4 Female 26.0 ... 31136.0 152 186.0
## 4 5 Female 29.0 ... 32923.0 152 34.0
## ... ... ... ... ... ... ... ...
## 29995 29996 Male 33.0 ... 49455.0 122 198.0
## 29996 29997 Male 41.0 ... 42721.0 54 132.0
## 29997 29998 Male 25.0 ... 35369.0 152 43.0
## 29998 29999 Male 50.0 ... 43214.0 7 48.0
## 29999 30000 Female 34.0 ... 2630.0 124 208.0
##
## [30000 rows x 11 columns]
df_onehot = pd.get_dummies(df, dtype='int')
df_onehot
## Age Annual_Premium ... Response_0 Response_1
## 0 24.0 21795.0 ... 1 0
## 1 48.0 28274.0 ... 1 0
## 2 71.0 40297.0 ... 0 1
## 3 74.0 37214.0 ... 1 0
## 4 43.0 2575100.0 ... 1 0
## ... ... ... ... ... ...
## 49995 24.0 37012.0 ... 1 0
## 49996 50.0 43065.0 ... 1 0
## 49997 25.0 36435.0 ... 0 1
## 49998 35.0 26490.0 ... 1 0
## 49999 65.0 2630.0 ... 1 0
##
## [50000 rows x 50198 columns]
column_to_move = df_onehot.pop(“Response”) # insert column with insert(location, column_name, column_value) df_onehot.insert(len(df_onehot.columns), “Response”, column_to_move) df_onehot ```