knitr::opts_chunk$set(echo = TRUE)
Insurance provides financing, leverage, cushion against possible eventuality. It is an arrangement by which the Company undertakes to provide a guarantee of compensation for specified loss, damage, illness or death in return by paymnent of a specified premium.
Premium paid by the customer is the major revenue source for insurance companies. Default in premium payments results in significant revenue losses and hence insurance companies would like to know upfront which type of customers would default premium payments. The objective of this project is to predict the probability that a customer will default the premium payment, so that the insurance agent can proactively reach out to the policy holder to follow up for the payment of premium.
Problem Statement: *The project ultimate focus is to predict if the current customer(s) will Default in Future Payments
The Project will include the following: * Visual inspection of data (rows, columns, descriptive details) * Understanding of attributes (variable info, renaming if required) * Univariate analysis (distribution and spread for every continuous attribute, distribution of the data in categories for categorical ones) * Bivariate analysis (relationship between different variables, correlations) * Building a model that can predict the likelihood of a customer defaulting on premium payments (Who is likely to default) * Identifing the factors that drive higher default rate (Are there any characteristics of the customers who are likely to default?) * Propose a strategy for reducing default rates by using the model and other insights from the analysis (What should be done to reduce the default rates?)
The dataset contains the following details collected well over ten years on 79, 854 Policy Holders. (The exact number of years will be decided as we gain better understanding of the data) * id: Unique customer ID perc_premium_paid_by_cash_credit: What % of the premium was paid by cash payments? * age_in_days: age of the customer in days * Income: Income of the customer * Marital Status: Married/Unmarried, Married (1), unmarried (0) * Veh_owned: Number of vehicles owned (1-3) * Count_3-6_months_late: Number of times premium was paid 3-6 months late * Count_6-12_months_late: Number of times premium was paid 6-12 months late * Count_more_than_12_months_late: Number of times premium was paid more than 12 months late * Risk_score: Risk score of customer (similar to credit score) * No_of_dep: Number of dependents in the family of the customer (1-4) * Accommodation: Owned (1), Rented (0) no_of_premiums_paid: Number of premiums paid till date * sourcing_channel: Channel through which customer was sourced * residence_area_type: Residence type of the customer * premium : Total premium amount paid till now * default: Y variable - 0 indicates that customer has defaulted the premium and 1 indicates that customer has not defaulted
library(car)
## Loading required package: carData
library(MASS)
library(ggplot2)
library(gridExtra)
library(caTools)
library(e1071)
library(ROCR)
library(InformationValue)
library(class)
library(knitr)
library(corrplot)
## corrplot 0.84 loaded
library(dataframeexplorer)
setwd("~/Great Learning/Capstone Project")
library(readxl)
InsuranceDataset <- read_excel("Insurance Premium Default-Dataset.xlsx")
View(InsuranceDataset)
head(InsuranceDataset)
## # A tibble: 6 x 17
## id perc_premium_pa~ age_in_days Income `Count_3-6_mont~ `Count_6-12_mon~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.317 11330 90050 0 0
## 2 2 0 30309 156080 0 0
## 3 3 0.015 16069 145020 1 0
## 4 4 0 23733 187560 0 0
## 5 5 0.888 19360 103050 7 3
## 6 6 0.512 16795 113500 0 0
## # ... with 11 more variables: Count_more_than_12_months_late <dbl>, `Marital
## # Status` <dbl>, Veh_Owned <dbl>, No_of_dep <dbl>, Accomodation <dbl>,
## # risk_score <dbl>, no_of_premiums_paid <dbl>, sourcing_channel <chr>,
## # residence_area_type <chr>, premium <dbl>, default <dbl>
tail(InsuranceDataset)
## # A tibble: 6 x 17
## id perc_premium_pa~ age_in_days Income `Count_3-6_mont~ `Count_6-12_mon~
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 79848 0.009 21545 133140 0 0
## 2 79849 0.249 25555 64420 0 0
## 3 79850 0.003 16797 660040 1 0
## 4 79851 0.012 24835 227760 0 0
## 5 79852 0.19 10959 153060 1 0
## 6 79853 0 19720 324030 0 0
## # ... with 11 more variables: Count_more_than_12_months_late <dbl>, `Marital
## # Status` <dbl>, Veh_Owned <dbl>, No_of_dep <dbl>, Accomodation <dbl>,
## # risk_score <dbl>, no_of_premiums_paid <dbl>, sourcing_channel <chr>,
## # residence_area_type <chr>, premium <dbl>, default <dbl>
str(InsuranceDataset)
## tibble [79,853 x 17] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:79853] 1 2 3 4 5 6 7 8 9 10 ...
## $ perc_premium_paid_by_cash_credit: num [1:79853] 0.317 0 0.015 0 0.888 0.512 0 0.994 0.019 0.018 ...
## $ age_in_days : num [1:79853] 11330 30309 16069 23733 19360 ...
## $ Income : num [1:79853] 90050 156080 145020 187560 103050 ...
## $ Count_3-6_months_late : num [1:79853] 0 0 1 0 7 0 0 0 0 0 ...
## $ Count_6-12_months_late : num [1:79853] 0 0 0 0 3 0 0 0 0 0 ...
## $ Count_more_than_12_months_late : num [1:79853] 0 0 0 0 4 0 0 0 0 0 ...
## $ Marital Status : num [1:79853] 0 1 0 1 0 0 0 0 1 1 ...
## $ Veh_Owned : num [1:79853] 3 3 1 1 2 1 3 3 2 3 ...
## $ No_of_dep : num [1:79853] 3 1 1 1 1 4 4 2 4 3 ...
## $ Accomodation : num [1:79853] 1 1 1 0 0 0 1 0 1 1 ...
## $ risk_score : num [1:79853] 98.8 99.1 99.2 99.4 98.8 ...
## $ no_of_premiums_paid : num [1:79853] 8 3 14 13 15 4 8 4 8 8 ...
## $ sourcing_channel : chr [1:79853] "A" "A" "C" "A" ...
## $ residence_area_type : chr [1:79853] "Rural" "Urban" "Urban" "Urban" ...
## $ premium : num [1:79853] 5400 11700 18000 13800 7500 3300 20100 3300 5400 9600 ...
## $ default : num [1:79853] 1 1 1 1 0 1 1 1 1 1 ...
colnames(InsuranceDataset)
## [1] "id" "perc_premium_paid_by_cash_credit"
## [3] "age_in_days" "Income"
## [5] "Count_3-6_months_late" "Count_6-12_months_late"
## [7] "Count_more_than_12_months_late" "Marital Status"
## [9] "Veh_Owned" "No_of_dep"
## [11] "Accomodation" "risk_score"
## [13] "no_of_premiums_paid" "sourcing_channel"
## [15] "residence_area_type" "premium"
## [17] "default"
names(InsuranceDataset) [names(InsuranceDataset) == "perc_premium_paid_by_cash_credit"] <-"Perc_Premiumcash"
names(InsuranceDataset) [names(InsuranceDataset) == "age_in_days"] <-"Age"
names(InsuranceDataset) [names(InsuranceDataset) == "Count_3-6_months_late"] <-"Count3-6"
names(InsuranceDataset) [names(InsuranceDataset) == "Count_6-12_months_late"] <-"Count6_12"
names(InsuranceDataset) [names(InsuranceDataset) == "Count_more_than_12_months_late"] <-"Count12andmore"
names(InsuranceDataset) [names(InsuranceDataset) == "Marital Status"] <-"Marital_Status"
names(InsuranceDataset) [names(InsuranceDataset) == "No_of_dep"] <-"Dependents"
names(InsuranceDataset) [names(InsuranceDataset) == "risk_score"] <-"Risk_Score"
names(InsuranceDataset) [names(InsuranceDataset) == "no_of_premiums_paid"] <-"Premiums_Paid"
names(InsuranceDataset) [names(InsuranceDataset) == "sourcing_channel"] <-"Sourcing_Channel"
names(InsuranceDataset) [names(InsuranceDataset) == "residence_area_type"] <-"Residence"
names(InsuranceDataset) [names(InsuranceDataset) == "premium"] <-"Premium"
names(InsuranceDataset) [names(InsuranceDataset) == "default"] <-"Default"
colnames(InsuranceDataset)
## [1] "id" "Perc_Premiumcash" "Age" "Income"
## [5] "Count3-6" "Count6_12" "Count12andmore" "Marital_Status"
## [9] "Veh_Owned" "Dependents" "Accomodation" "Risk_Score"
## [13] "Premiums_Paid" "Sourcing_Channel" "Residence" "Premium"
## [17] "Default"
colSums(is.na(InsuranceDataset))
## id Perc_Premiumcash Age Income
## 0 0 0 0
## Count3-6 Count6_12 Count12andmore Marital_Status
## 0 0 0 0
## Veh_Owned Dependents Accomodation Risk_Score
## 0 0 0 0
## Premiums_Paid Sourcing_Channel Residence Premium
## 0 0 0 0
## Default
## 0
summary (InsuranceDataset)
## id Perc_Premiumcash Age Income
## Min. : 1 Min. :0.0000 Min. : 7670 Min. : 24030
## 1st Qu.:19964 1st Qu.:0.0340 1st Qu.:14974 1st Qu.: 108010
## Median :39927 Median :0.1670 Median :18625 Median : 166560
## Mean :39927 Mean :0.3143 Mean :18847 Mean : 208847
## 3rd Qu.:59890 3rd Qu.:0.5380 3rd Qu.:22636 3rd Qu.: 252090
## Max. :79853 Max. :1.0000 Max. :37602 Max. :90262600
## Count3-6 Count6_12 Count12andmore Marital_Status
## Min. : 0.0000 Min. : 0.00000 Min. : 0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.00000 Median :0.0000
## Mean : 0.2484 Mean : 0.07809 Mean : 0.05994 Mean :0.4987
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.:1.0000
## Max. :13.0000 Max. :17.00000 Max. :11.00000 Max. :1.0000
## Veh_Owned Dependents Accomodation Risk_Score
## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :91.90
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:98.83
## Median :2.000 Median :3.000 Median :1.0000 Median :99.18
## Mean :1.998 Mean :2.503 Mean :0.5013 Mean :99.07
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:99.52
## Max. :3.000 Max. :4.000 Max. :1.0000 Max. :99.89
## Premiums_Paid Sourcing_Channel Residence Premium
## Min. : 2.00 Length:79853 Length:79853 Min. : 1200
## 1st Qu.: 7.00 Class :character Class :character 1st Qu.: 5400
## Median :10.00 Mode :character Mode :character Median : 7500
## Mean :10.86 Mean :10925
## 3rd Qu.:14.00 3rd Qu.:13800
## Max. :60.00 Max. :60000
## Default
## Min. :0.0000
## 1st Qu.:1.0000
## Median :1.0000
## Mean :0.9374
## 3rd Qu.:1.0000
## Max. :1.0000
InsuranceDataset$AgeinYears <- InsuranceDataset$Age /365
InsuranceDataset$IncomeClass <- cut(InsuranceDataset$Income, breaks = c(0, 150000, 250000, 90262600), labels = c("low","middle","high"))
dim(InsuranceDataset)
## [1] 79853 19
summary(InsuranceDataset)
## id Perc_Premiumcash Age Income
## Min. : 1 Min. :0.0000 Min. : 7670 Min. : 24030
## 1st Qu.:19964 1st Qu.:0.0340 1st Qu.:14974 1st Qu.: 108010
## Median :39927 Median :0.1670 Median :18625 Median : 166560
## Mean :39927 Mean :0.3143 Mean :18847 Mean : 208847
## 3rd Qu.:59890 3rd Qu.:0.5380 3rd Qu.:22636 3rd Qu.: 252090
## Max. :79853 Max. :1.0000 Max. :37602 Max. :90262600
## Count3-6 Count6_12 Count12andmore Marital_Status
## Min. : 0.0000 Min. : 0.00000 Min. : 0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.00000 Median : 0.00000 Median :0.0000
## Mean : 0.2484 Mean : 0.07809 Mean : 0.05994 Mean :0.4987
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000 3rd Qu.:1.0000
## Max. :13.0000 Max. :17.00000 Max. :11.00000 Max. :1.0000
## Veh_Owned Dependents Accomodation Risk_Score
## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :91.90
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:0.0000 1st Qu.:98.83
## Median :2.000 Median :3.000 Median :1.0000 Median :99.18
## Mean :1.998 Mean :2.503 Mean :0.5013 Mean :99.07
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:99.52
## Max. :3.000 Max. :4.000 Max. :1.0000 Max. :99.89
## Premiums_Paid Sourcing_Channel Residence Premium
## Min. : 2.00 Length:79853 Length:79853 Min. : 1200
## 1st Qu.: 7.00 Class :character Class :character 1st Qu.: 5400
## Median :10.00 Mode :character Mode :character Median : 7500
## Mean :10.86 Mean :10925
## 3rd Qu.:14.00 3rd Qu.:13800
## Max. :60.00 Max. :60000
## Default AgeinYears IncomeClass
## Min. :0.0000 Min. : 21.01 low :33495
## 1st Qu.:1.0000 1st Qu.: 41.02 middle:25779
## Median :1.0000 Median : 51.03 high :20579
## Mean :0.9374 Mean : 51.63
## 3rd Qu.:1.0000 3rd Qu.: 62.02
## Max. :1.0000 Max. :103.02
InsuranceDataset$IncomeClass <- as.factor(InsuranceDataset$IncomeClass)
InsuranceDataset$Marital_Status <- as.factor(InsuranceDataset$Marital_Status)
InsuranceDataset$Accomodation <- as.factor(InsuranceDataset$Accomodation)
InsuranceDataset$Sourcing_Channel <- as.factor(InsuranceDataset$Sourcing_Channel)
InsuranceDataset$Residence <- as.factor(InsuranceDataset$Residence)
InsuranceDataset$Default <- as.factor(InsuranceDataset$Default)
str(InsuranceDataset)
## tibble [79,853 x 19] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:79853] 1 2 3 4 5 6 7 8 9 10 ...
## $ Perc_Premiumcash: num [1:79853] 0.317 0 0.015 0 0.888 0.512 0 0.994 0.019 0.018 ...
## $ Age : num [1:79853] 11330 30309 16069 23733 19360 ...
## $ Income : num [1:79853] 90050 156080 145020 187560 103050 ...
## $ Count3-6 : num [1:79853] 0 0 1 0 7 0 0 0 0 0 ...
## $ Count6_12 : num [1:79853] 0 0 0 0 3 0 0 0 0 0 ...
## $ Count12andmore : num [1:79853] 0 0 0 0 4 0 0 0 0 0 ...
## $ Marital_Status : Factor w/ 2 levels "0","1": 1 2 1 2 1 1 1 1 2 2 ...
## $ Veh_Owned : num [1:79853] 3 3 1 1 2 1 3 3 2 3 ...
## $ Dependents : num [1:79853] 3 1 1 1 1 4 4 2 4 3 ...
## $ Accomodation : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 2 1 2 2 ...
## $ Risk_Score : num [1:79853] 98.8 99.1 99.2 99.4 98.8 ...
## $ Premiums_Paid : num [1:79853] 8 3 14 13 15 4 8 4 8 8 ...
## $ Sourcing_Channel: Factor w/ 5 levels "A","B","C","D",..: 1 1 3 1 1 2 3 1 1 1 ...
## $ Residence : Factor w/ 2 levels "Rural","Urban": 1 2 2 2 2 1 1 2 2 1 ...
## $ Premium : num [1:79853] 5400 11700 18000 13800 7500 3300 20100 3300 5400 9600 ...
## $ Default : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 2 2 2 ...
## $ AgeinYears : num [1:79853] 31 83 44 65 53 ...
## $ IncomeClass : Factor w/ 3 levels "low","middle",..: 1 2 1 2 1 1 3 1 1 2 ...
summary(InsuranceDataset)
## id Perc_Premiumcash Age Income
## Min. : 1 Min. :0.0000 Min. : 7670 Min. : 24030
## 1st Qu.:19964 1st Qu.:0.0340 1st Qu.:14974 1st Qu.: 108010
## Median :39927 Median :0.1670 Median :18625 Median : 166560
## Mean :39927 Mean :0.3143 Mean :18847 Mean : 208847
## 3rd Qu.:59890 3rd Qu.:0.5380 3rd Qu.:22636 3rd Qu.: 252090
## Max. :79853 Max. :1.0000 Max. :37602 Max. :90262600
## Count3-6 Count6_12 Count12andmore Marital_Status
## Min. : 0.0000 Min. : 0.00000 Min. : 0.00000 0:40032
## 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000 1:39821
## Median : 0.0000 Median : 0.00000 Median : 0.00000
## Mean : 0.2484 Mean : 0.07809 Mean : 0.05994
## 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.00000
## Max. :13.0000 Max. :17.00000 Max. :11.00000
## Veh_Owned Dependents Accomodation Risk_Score Premiums_Paid
## Min. :1.000 Min. :1.000 0:39823 Min. :91.90 Min. : 2.00
## 1st Qu.:1.000 1st Qu.:2.000 1:40030 1st Qu.:98.83 1st Qu.: 7.00
## Median :2.000 Median :3.000 Median :99.18 Median :10.00
## Mean :1.998 Mean :2.503 Mean :99.07 Mean :10.86
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:99.52 3rd Qu.:14.00
## Max. :3.000 Max. :4.000 Max. :99.89 Max. :60.00
## Sourcing_Channel Residence Premium Default AgeinYears
## A:43134 Rural:31670 Min. : 1200 0: 4998 Min. : 21.01
## B:16512 Urban:48183 1st Qu.: 5400 1:74855 1st Qu.: 41.02
## C:12039 Median : 7500 Median : 51.03
## D: 7559 Mean :10925 Mean : 51.63
## E: 609 3rd Qu.:13800 3rd Qu.: 62.02
## Max. :60000 Max. :103.02
## IncomeClass
## low :33495
## middle:25779
## high :20579
##
##
##
InsuranceDataset <- subset(InsuranceDataset, select = -c(3))
dim(InsuranceDataset)
## [1] 79853 18
install.packages("ggplot2")
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)
plot_histogram_n_boxplot = function(variable, variableNameString, binw){
h = ggplot(data = InsuranceDataset, aes(x= variable))+
labs(x = variableNameString,y ='count')+
geom_histogram(fill = 'green',col = 'white',binwidth = binw)+
geom_vline(aes(xintercept=mean(variable)),
color="black", linetype="dashed", size=0.5)
b = ggplot(data = InsuranceDataset, aes('',variable))+
geom_boxplot(outlier.colour = 'red',col = 'red',outlier.shape = 19)+
labs(x = '',y = variableNameString)+ coord_flip()
grid.arrange(h,b,ncol = 2)
}
plot_histogram_n_boxplot(InsuranceDataset$AgeinYears, 'Age', 1)
hist(InsuranceDataset$Income, col = "blue", main = "Income")
plot_histogram_n_boxplot(InsuranceDataset$Veh_Owned, 'Vehicle Owned', 1)
* The distribution of number of vehicle of owned by customers is evenly distributed between 1 to 3 vehicles * The proportion of customers that own 1,2 and 3 cars each are the same as seen in the chart above
plot_histogram_n_boxplot(InsuranceDataset$Dependents, 'Dependents', 1)
* The distribution of Dependents in family is also evenly distributed * Most of the customers have 2-3 dependents * The proportion of customers with 1,2,3 & 4 dependents is the same.
plot_histogram_n_boxplot(InsuranceDataset$Risk_Score, 'Risk Score', 1)
* The Risk Score is skwed to the right, with left outliers * The Median Risk Score is 99 * Majority of the Clients have very high risk score, with the higher proportion having 99% risk score
plot_stacked_barchart = function(variable, variableNamesString){ggplot(InsuranceDataset, aes(fill = Default, x = variable)) + geom_bar(position = "fill")+
labs(title = variableNamesString, y = '', x = '')+
scale_fill_manual(values = c("RED", "BLUE"))}
plot_stacked_barchart(InsuranceDataset$Marital_Status, 'Marital Status')
* Marital Status doesn’t seem to have any impact on the Default on Premium as the similar proportion of Default exists between the Married and the Non Married. * Most of the customers don’t Default on their Premiums
plot_stacked_barchart(InsuranceDataset$Accomodation, 'Accomodation')
* Accomodation doesn’t seem to have much impact on Default * Thus who own their accomodation may seem to have a light higher chance of Defaulting on their premiums
plot_stacked_barchart(InsuranceDataset$Sourcing_Channel, 'Sourcing Channel')
* Sourcing Channels tends to have varying impact on Default * Customers sourced from Channel A have the lowest chance of Defaulting * Customers sourced from Channel D and E have the higher Chance of Defaulting as compared to Channel A * Customers sourced from D Channel have the highest chance of Defaulting, followed by those from channels C & E, whose risk of Defaulting seems to be the same, followed by those from B, while Customers from A have the lowest risk of Defaulting
plot_stacked_barchart(InsuranceDataset$Residence, 'Residence')
* Residence doesn’t seem to have any impact on Default * Those from Rural areas tends to have a slightly higher chance of Defaulting
plot_stacked_barchart(InsuranceDataset$IncomeClass, 'Income Class')
* The Income Class of the customer tend to have a significant impact on Default * Those Customers from lower income groups have a higher chance of Defaulting than those from Middle Income Class, while the High Income Class Customers have the least likelihood of Defaulting as expected
boxplot(InsuranceDataset$Dependents~InsuranceDataset$Count6_12, col = "Blue", main = "No of Dependents versus Late Premium in 6-12 months")
* Payment of 6-12 months is more prevalent among the customers with higher number of Dependents
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$AgeinYears,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* The lower the age the more the chances of Default on Premiums
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$Veh_Owned,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* No of Vehicle Owned doesn’t have any effect on Defualt on Payment of Premium
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$`Count3-6`,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* According to the data available, the lower the number of times premium was paid below 6 months, the more the likelihood of Default * THis doesn’t sound logical and we will still check the dat
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$Count12andmore,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* The proportion of Customers that pay their premium more than 12 months late and those who Default on Premium is propotional to increasing delays in payment
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$Risk_Score,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* The more the Risk Score the higher the chance of Default
ggplot(InsuranceDataset,aes_string(x=InsuranceDataset$Dependents,fill="Default")) + geom_histogram(bins=50,alpha=0.5,colour='black')
* No of Dependents have only a slight impact on Default; although the higher the number of Dependents the higher the chance of Default
install.packages("corrplot")
## Warning: package 'corrplot' is in use and will not be installed
library(corrplot)
InsuranceDataset$IncomeClass <- as.numeric(InsuranceDataset$IncomeClass)
InsuranceDataset$Marital_Status <- as.numeric(InsuranceDataset$Marital_Status)
InsuranceDataset$Accomodation <- as.numeric(InsuranceDataset$Accomodation)
InsuranceDataset$Sourcing_Channel <- as.numeric(InsuranceDataset$Sourcing_Channel)
InsuranceDataset$Residence <- as.numeric(InsuranceDataset$Residence)
InsuranceDataset$Default <- as.numeric(InsuranceDataset$Default)
M <- cor(InsuranceDataset)
corrplot(M, order = "AOE")
#======================================================================= # # T H E - E N D # #=======================================================================
knitr::purl(“Insurance Premium Default RMD.RMD”, documentation = 0)