Exploratory Data Analysis
# High-level profile of `raw`; each call draws one figure, in this order.

# Missing-value profile per column.
plot_missing(raw)

# Density curves for the columns of `raw`.
plot_density(raw)

# Summary figure describing the data set as a whole.
plot_intro(raw)

# Density of the `age` column on its own.
plot_density(raw$age, title = "Age distribution")

# Distribution of the account balance.
#
# `balance` is a continuous variable, so it is binned into a histogram rather
# than drawn as one bar per distinct value (geom_bar(stat = "count")).
# coord_cartesian() zooms the x-axis to 0-125000 without removing data; the
# previous scale-level xlim() dropped 3766 rows that fell outside that window
# before counting and emitted "Removed ... rows" warnings.
ggplot(data = raw, aes(x = balance)) +
  geom_histogram(binwidth = 1000, boundary = 0) +
  coord_cartesian(xlim = c(0, 125000)) +
  theme_bw() +
  labs(y = "Count", x = "Outstanding balance")

# Histogram of `duration` using 30 equal-width bins.
ggplot(raw, aes(x = duration)) +
  geom_histogram(bins = 30, fill = "blue")

# Correlation heatmap restricted to the continuous columns.
plot_correlation(raw, type = "continuous")

# Correlation heatmap restricted to the discrete columns.
plot_correlation(raw, type = "discrete")

# Number of rows per `loan` value, with bars coloured by the same variable.
ggplot(raw, aes(x = loan, fill = as.factor(loan))) +
  geom_bar()

# Stacked counts per job category, split by personal-loan status.
ggplot(raw, aes(x = job, fill = as.factor(loan))) +
  geom_bar() +
  # Rotate the job labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Job category vs Personal loan")

# Stacked counts per job category, split by housing-mortgage status.
ggplot(raw, aes(x = job, fill = as.factor(housing))) +
  geom_bar() +
  # Rotate the job labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Job category vs Housing mortgage")

# Compact column-by-column listing of `raw` (types and leading values).
glimpse(x = raw)
## Observations: 45,211
## Variables: 17
## $ age <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, ...
## $ job <fct> management, technician, entrepreneur, blue-collar, u...
## $ marital <fct> married, single, married, married, single, married, ...
## $ education <fct> tertiary, secondary, secondary, unknown, unknown, te...
## $ default <fct> no, no, no, no, no, no, no, yes, no, no, no, no, no,...
## $ balance <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 39...
## $ housing <fct> yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes...
## $ loan <fct> no, no, yes, no, no, no, yes, no, no, no, no, no, no...
## $ contact <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ day <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ month <fct> may, may, may, may, may, may, may, may, may, may, ma...
## $ duration <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 1...
## $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, ...
# Same two correlation heatmaps as drawn earlier in this script; `raw` has not
# been modified in between, so the figures are repeats.
plot_correlation(raw, type = "continuous")

plot_correlation(raw, type = "discrete")

# Frequency of each personal-loan value.
table(raw[["loan"]])
##
## no yes
## 37967 7244
library(ggplot2)

# Percentage of customers with and without a personal loan, drawn as a
# horizontal bar chart with the percentage printed inside each bar.
Totalrows <- nrow(raw)
raw %>%
  group_by(loan) %>%
  # Share of all rows falling in each loan group, in percent.
  summarise(Percent = n() / Totalrows * 100) %>%
  ungroup() %>%
  # Order the bars by size; the factor order, not the row order, drives the axis.
  mutate(loan = reorder(loan, Percent)) %>%
  ggplot(aes(x = loan, y = Percent)) +
  geom_bar(stat = "identity", fill = "blue") +
  # y = 1 anchors the label just inside the start of each bar.
  geom_text(
    aes(x = loan, y = 1, label = paste0("( ", round(Percent, 2), " %)")),
    hjust = 0, vjust = 0.5, size = 4, colour = "white", fontface = "bold"
  ) +
  labs(
    x = "loan",
    y = "Percentage",
    title = "Loan and Count",
    # `loan` still holds "no"/"yes" at this point, so describe those labels.
    subtitle = "no - No loan ; yes - With loan"
  ) +
  coord_flip() +
  theme_bw()

# Age distribution within each job category. The plot does not involve the
# personal-loan variable, so the title describes only age and job.
boxplot(age ~ job, data = raw, col = "blue",
        main = "Age distribution by job category")

library(ggplot2)

# Percentage of customers with and without a housing mortgage, drawn as a
# horizontal bar chart with the percentage printed inside each bar.
Totalrows <- nrow(raw)
raw %>%
  group_by(housing) %>%
  # Share of all rows falling in each housing group, in percent.
  summarise(Percent = n() / Totalrows * 100) %>%
  ungroup() %>%
  # Order the bars by size; the factor order, not the row order, drives the axis.
  mutate(housing = reorder(housing, Percent)) %>%
  ggplot(aes(x = housing, y = Percent)) +
  geom_bar(stat = "identity", fill = "blue") +
  # y = 1 anchors the label just inside the start of each bar.
  geom_text(
    aes(x = housing, y = 1, label = paste0("( ", round(Percent, 2), " %)")),
    hjust = 0, vjust = 0.5, size = 4, colour = "white", fontface = "bold"
  ) +
  labs(
    x = "housing",
    y = "Percentage",
    title = "Housing Mortgage and Count",
    # `housing` holds "no"/"yes", so describe those labels.
    subtitle = "no - No mortgage ; yes - With mortgage"
  ) +
  coord_flip() +
  theme_bw()

library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# ---- Personal-loan classifier: gradient boosting fitted through caret ----

# Recode the target: "1" = has a personal loan, "0" = does not.
raw$loan <- ifelse(raw$loan == "yes", "1", "0")
loan_levels <- c("0", "1")

# Fix the RNG state so the train/test split and the CV resamples are the same
# on every run.
set.seed(123)

# Step 1: Get row numbers for the training data (80 % of rows)
index <- createDataPartition(raw$loan, p = 0.8, list = FALSE)
# Step 2: Create the training dataset
trainData <- raw[index, ]
# Step 3: Create the test dataset
testData <- raw[-index, ]

#----- Generalized Model
# The outcome must be a factor for train() to fit a classifier; the levels are
# stated explicitly so they never depend on which values happen to be present.
trainData$loan <- factor(trainData$loan, levels = loan_levels)

# 3-fold cross-validation, repeated 10 times.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 3,
  repeats = 10
)
gbm_fit <- train(
  loan ~ .,
  data = trainData,
  method = "gbm",
  trControl = fitControl,
  verbose = FALSE
)

# Class probabilities on the hold-out set; rounding the "1" column gives the
# predicted class as 0/1.
gbm_predict <- predict(gbm_fit, newdata = testData, type = "prob")
gbm_predict <- round(gbm_predict$`1`)

# Both factors carry the full level set, so the comparison still works when
# the model never predicts one of the classes.
# NOTE(review): confusionMatrix() uses the first level ("0") as the positive
# class here, so Sensitivity describes customers WITHOUT a loan; pass
# positive = "1" if metrics for loan holders are wanted.
gbm_cm <- confusionMatrix(
  factor(gbm_predict, levels = c(0, 1), labels = loan_levels),
  factor(testData$loan, levels = loan_levels)
)
gbm_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7556 1375
## 1 37 73
##
## Accuracy : 0.8438
## 95% CI : (0.8362, 0.8512)
## No Information Rate : 0.8398
## P-Value [Acc > NIR] : 0.1543
##
## Kappa : 0.0727
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99513
## Specificity : 0.05041
## Pos Pred Value : 0.84604
## Neg Pred Value : 0.66364
## Prevalence : 0.83984
## Detection Rate : 0.83575
## Detection Prevalence : 0.98783
## Balanced Accuracy : 0.52277
##
## 'Positive' Class : 0
##