#necessary libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
library(boot)
library(plotROC)
#Import dataset and check the first six rows
travelInsurance <- read.csv(file = '/Users/tracylam/Desktop/Portfolio/R/TravelInsurancePrediction.csv')
head(travelInsurance)
## Age Employment.Type GraduateOrNot AnnualIncome FamilyMembers
## 1 31 Government Sector Yes 400000 6
## 2 31 Private Sector/Self Employed Yes 1250000 7
## 3 34 Private Sector/Self Employed Yes 500000 4
## 4 28 Private Sector/Self Employed Yes 700000 3
## 5 28 Private Sector/Self Employed Yes 700000 8
## 6 25 Private Sector/Self Employed No 1150000 4
## ChronicDiseases FrequentFlyer EverTravelledAbroad TravelInsurance
## 1 1 No No 0
## 2 0 No No 0
## 3 1 No No 1
## 4 1 No No 0
## 5 1 Yes No 0
## 6 0 No No 0
In this data set, a company is offering travel insurance to their customers. Through this project, I will use classification to see what factors will have the most influence in whether a customer will purchase the travel insurance.
In this data set, we have variables including, age, employment type, graduate, income, amount of family members, diseases, frequent fliers, whether someone traveled abroad and whether travel insurance was purchased.
#Checking to see if there is any nulls in each column
colSums(is.na(travelInsurance))
## Age Employment.Type GraduateOrNot AnnualIncome
## 0 0 0 0
## FamilyMembers ChronicDiseases FrequentFlyer EverTravelledAbroad
## 0 0 0 0
## TravelInsurance
## 0
Since there are no nulls within each column, this means there is not missing data.
In this section, I will be visually exploring some of the variables that may have an affect on whether a customer will purchase the travel insurance package.
#creating the plots to view the different distributions of the variables
#p1 showcases their ages
p1 <- ggplot(data=travelInsurance) +
geom_bar(mapping = aes(x = Age), fill = '#B3E2CD', color = 'black') +
geom_text(mapping = aes(x = Age, label = ..count..),stat = 'count', vjust=-0.50)
p1
#p2 showcases employment type
p2 <- ggplot(data=travelInsurance) +
geom_bar(mapping = aes(x = "", fill = Employment.Type), width = 1) +
labs(x = NULL, y = NULL)
p2 <- p2 + coord_polar(theta = "y") +
theme(aspect.ratio = 1) +
scale_fill_brewer(palette = "Pastel2")
p2
#p3 showcases if they are a frequent flyer or not
p3 <- ggplot(data=travelInsurance) +
geom_bar(mapping = aes(x = "", fill = FrequentFlyer), width = 1) +
labs(x = NULL, y = NULL)
p3 <- p3 + coord_polar(theta = "y") +
theme(aspect.ratio = 1) +
scale_fill_brewer(palette = "Pastel2")
p3
#p4 showcases annual income
p4 <- ggplot(data=travelInsurance, mapping = aes(x = AnnualIncome)) +
geom_histogram(mapping = aes(y=..density..), colour = "black", fill = "white")+
geom_density(alpha = 0.2, fill = "#B3E2CD")
p4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In this section, I create the logistic regression model to be our classifier. This section also include different metrics of evaluation for our model.
#creating the logistic regression model
logit.fit <- glm(TravelInsurance ~ ., family = binomial(), data = travelInsurance)
summary(logit.fit)
##
## Call:
## glm(formula = TravelInsurance ~ ., family = binomial(), data = travelInsurance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2227 -0.8096 -0.5503 0.7177 2.2791
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -5.405e+00 6.340e-01 -8.525
## Age 7.326e-02 1.851e-02 3.958
## Employment.TypePrivate Sector/Self Employed 9.857e-02 1.326e-01 0.743
## GraduateOrNotYes -1.813e-01 1.562e-01 -1.160
## AnnualIncome 1.565e-06 1.769e-07 8.844
## FamilyMembers 1.529e-01 3.359e-02 4.551
## ChronicDiseases 8.999e-02 1.211e-01 0.743
## FrequentFlyerYes 4.595e-01 1.365e-01 3.366
## EverTravelledAbroadYes 1.718e+00 1.532e-01 11.211
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## Age 7.57e-05 ***
## Employment.TypePrivate Sector/Self Employed 0.457220
## GraduateOrNotYes 0.245859
## AnnualIncome < 2e-16 ***
## FamilyMembers 5.34e-06 ***
## ChronicDiseases 0.457497
## FrequentFlyerYes 0.000764 ***
## EverTravelledAbroadYes < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2590.5 on 1986 degrees of freedom
## Residual deviance: 2068.3 on 1978 degrees of freedom
## AIC: 2086.3
##
## Number of Fisher Scoring iterations: 4
As seen in the logistic regression model above, the most significant predictors for the binary variable, TravelInsurance, include age, annual income, family members, and if they are a frequent flyer. This is due to the fact that the p-values of these variables are less than the significance level of 0.05.
#Predicting the conditional probabilities
logit.fit.prob <- predict(logit.fit, type = 'response')
#Bayes Rule
logit.fit.class <- ifelse(logit.fit.prob > 0.5, "1", "0") %>% as.factor()
#Calculating the misclassification error rate
mean(travelInsurance$TravelInsurance != logit.fit.class)
## [1] 0.2279819
#Using cross validation to evaluate the logistic model
#setting an appropriate cost function for binary response variable
cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5)
#10-fold cross validation
cv.glm(travelInsurance, logit.fit, cost=cost, K=10)$delta[1]
## [1] 0.229995
#Creating a confusion matrix
confusion.matrix <- table(travelInsurance$TravelInsurance, logit.fit.class)
confusion.matrix
## logit.fit.class
## 0 1
## 0 1183 94
## 1 359 351
#Sensitivity
tp <- confusion.matrix[2,2]
fn <- confusion.matrix[2,1]
sensitivity <- tp/(tp+fn)
sensitivity
## [1] 0.4943662
#Specificity
tn <- confusion.matrix[1,1]
fp <- confusion.matrix[1,2]
specificity <- tn/(tn+fp)
specificity
## [1] 0.92639
#Visualizing the trade-off between sensitivity and specificity with ROC curve
roc.df <- tibble(observed = travelInsurance$TravelInsurance,
predicted = logit.fit.prob)
ggplot(data = roc.df, mapping = aes(d = observed, m = predicted)) +
geom_roc(labels = F)