Exploratory Data Analysis

# Dataset-wide overview plots of `raw`
# (helpers presumably come from DataExplorer -- confirm against the setup chunk).
plot_missing(raw)
plot_density(raw)
plot_intro(raw)

# Density plot restricted to the `age` column, with an explicit title.
plot_density(raw$age, title = "Age distribution")

# Distribution of the outstanding balance.
#
# `balance` is a continuous variable, so it is binned with a histogram
# instead of drawing one bar per distinct value (stat = "count").
# The x range is zoomed with coord_cartesian() rather than xlim(): scale
# limits discard out-of-range observations *before* the stat is computed,
# which is what produced the two "Removed ... rows" warnings this plot used
# to emit (3766 rows outside the limits, plus one bar at the edge of the
# range). coord_cartesian() only changes the visible window.
ggplot(raw, aes(x = balance)) +
  geom_histogram(bins = 30, boundary = 0) + # bin edge aligned at zero
  coord_cartesian(xlim = c(0, 125000)) +
  theme_bw() +
  labs(x = "Outstanding balance", y = "Count")

# Histogram of `duration` split into 30 bins, drawn in blue.
ggplot(raw, aes(x = duration)) +
  geom_histogram(bins = 30, fill = "blue")

# Correlation plots: continuous columns first, then discrete columns.
plot_correlation(raw, type = "continuous")
plot_correlation(raw, type = "discrete")

# Number of customers per personal-loan level, coloured by that same level.
ggplot(raw, aes(x = loan, fill = as.factor(loan))) +
  geom_bar()

# Customers per job category, stacked by personal-loan status.
# x-axis labels are rotated 90 degrees so the job names stay readable.
ggplot(raw, aes(x = job, fill = as.factor(loan))) +
  geom_bar() +
  ggtitle("Job category vs Personal loan") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Customers per job category, stacked by housing-mortgage status.
ggplot(raw, aes(x = job, fill = as.factor(housing))) +
  geom_bar() +
  ggtitle("Job category vs Housing mortgage") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Print a compact per-column overview of `raw` (column name, type and the
# first few values); the recorded output follows below.
glimpse(raw)
## Observations: 45,211
## Variables: 17
## $ age       <int> 58, 44, 33, 47, 33, 35, 28, 42, 58, 43, 41, 29, 53, ...
## $ job       <fct> management, technician, entrepreneur, blue-collar, u...
## $ marital   <fct> married, single, married, married, single, married, ...
## $ education <fct> tertiary, secondary, secondary, unknown, unknown, te...
## $ default   <fct> no, no, no, no, no, no, no, yes, no, no, no, no, no,...
## $ balance   <int> 2143, 29, 2, 1506, 1, 231, 447, 2, 121, 593, 270, 39...
## $ housing   <fct> yes, yes, yes, yes, no, yes, yes, yes, yes, yes, yes...
## $ loan      <fct> no, no, yes, no, no, no, yes, no, no, no, no, no, no...
## $ contact   <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ day       <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
## $ month     <fct> may, may, may, may, may, may, may, may, may, may, ma...
## $ duration  <int> 261, 151, 76, 92, 198, 139, 217, 380, 50, 55, 222, 1...
## $ campaign  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ...
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ poutcome  <fct> unknown, unknown, unknown, unknown, unknown, unknown...
## $ y         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, ...
# Correlation plots for the continuous and the discrete columns of `raw`.
# NOTE(review): these two calls repeat the correlation plots already drawn
# earlier in this document on the same, unmodified `raw` -- confirm whether
# the duplication is intentional.
plot_correlation(raw,type="continuous")

plot_correlation(raw,type = "discrete")

# Frequency of each personal-loan level.
table(raw[["loan"]])
## 
##    no   yes 
## 37967  7244
library(ggplot2)

# Total number of rows in `raw`; denominator for the percentage shares.
Totalrows <- nrow(raw)

# Horizontal bar chart of the share (in %) of customers per personal-loan
# level. At this point `loan` still holds the labels "no"/"yes", so the
# subtitle explains those labels rather than 0/1 codes.
raw %>%
  group_by(loan) %>%
  summarise(Percentage = n() / Totalrows * 100) %>%
  ungroup() %>%
  arrange(desc(Percentage)) %>%
  # Order the factor levels by share so the bars are sorted in the plot.
  mutate(loan = reorder(loan, Percentage)) %>%
  ggplot(aes(x = loan, y = Percentage)) +
  geom_bar(stat = "identity", fill = "blue") +
  # Percentage label anchored near the start of each bar (y = 1).
  geom_text(
    aes(y = 1, label = paste0("( ", round(Percentage, 2), " %)")),
    hjust = 0, vjust = 0.5, size = 4, colour = "white", fontface = "bold"
  ) +
  labs(
    x = "loan",
    y = "Percentage",
    title = "Personal loan: share of customers",
    subtitle = "no - No loan ; yes - With loan"
  ) +
  coord_flip() +
  theme_bw()

# Box plot of customer age for each job category.
# The formula is age ~ job only (personal loan is not part of this plot),
# so the title describes exactly that.
boxplot(age ~ job, data = raw, col = "blue",
        main = "Age distribution by job category")

library(ggplot2)

# Total number of rows in `raw`; denominator for the percentage shares.
Totalrows <- nrow(raw)

# Horizontal bar chart of the share (in %) of customers per housing-mortgage
# level. `housing` holds the labels "no"/"yes", so the subtitle explains
# those labels rather than 0/1 codes.
raw %>%
  group_by(housing) %>%
  summarise(Percentage = n() / Totalrows * 100) %>%
  ungroup() %>%
  arrange(desc(Percentage)) %>%
  # Order the factor levels by share so the bars are sorted in the plot.
  mutate(housing = reorder(housing, Percentage)) %>%
  ggplot(aes(x = housing, y = Percentage)) +
  geom_bar(stat = "identity", fill = "blue") +
  # Percentage label anchored near the start of each bar (y = 1).
  geom_text(
    aes(y = 1, label = paste0("( ", round(Percentage, 2), " %)")),
    hjust = 0, vjust = 0.5, size = 4, colour = "white", fontface = "bold"
  ) +
  labs(
    x = "housing",
    y = "Percentage",
    title = "Housing mortgage: share of customers",
    subtitle = "no - No mortgage ; yes - With mortgage"
  ) +
  coord_flip() +
  theme_bw()

library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
# Recode the target: "yes" -> "1", anything else -> "0" (character values).
raw$loan <- ifelse(raw$loan == "yes", "1", "0")

# Fix the RNG seed so the random train/test partition -- and therefore the
# evaluation metrics derived from it -- can be reproduced between runs.
set.seed(123)

# Step 1: Get row numbers for the training data (80% of rows, sampled
# within each level of `loan`)
index <- createDataPartition(raw$loan, p = 0.8, list = FALSE)

# Step 2: Create the training dataset
trainData <- raw[index, ]

# Step 3: Create the test dataset (all rows not selected for training)
testData <- raw[-index, ]
#----- Generalized Model
# Fit a gradient boosting model (caret method "gbm") that predicts `loan`
# from all other columns, then evaluate it on the held-out test set.

# caret needs a factor outcome to treat this as classification.
trainData$loan <- as.factor(trainData$loan)

# 3-fold cross-validation repeated 10 times.
fitControl <- trainControl(method = "repeatedcv",
                           number = 3,
                           repeats = 10)

# Seed the RNG so the resampling folds and the stochastic GBM fit are
# reproducible between runs.
set.seed(123)
gbm_fit <- train(loan ~ ., data = trainData,
                 method = "gbm",
                 trControl = fitControl,
                 verbose = FALSE)

# Class probabilities for the test set; one column per class ("0" and "1").
gbm_prob <- predict(gbm_fit, newdata = testData, type = "prob")

# Hard 0/1 prediction: rounding the probability of class "1" thresholds it
# at 0.5.
gbm_predict <- round(gbm_prob$`1`)

# Both factors get the full level set explicitly so the confusion matrix is
# still well-formed when the model predicts only one class. The positive
# class is set to "1" (customer has a loan); by default caret uses the first
# level ("0"), which makes sensitivity/specificity describe the "no loan"
# class instead.
# NOTE: the printed results that follow in this document were produced with
# the default positive class "0" and without a fixed seed.
gbm_cm <- confusionMatrix(
  factor(gbm_predict, levels = c(0, 1)),
  factor(testData$loan, levels = c("0", "1")),
  positive = "1"
)
gbm_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7556 1375
##          1   37   73
##                                           
##                Accuracy : 0.8438          
##                  95% CI : (0.8362, 0.8512)
##     No Information Rate : 0.8398          
##     P-Value [Acc > NIR] : 0.1543          
##                                           
##                   Kappa : 0.0727          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99513         
##             Specificity : 0.05041         
##          Pos Pred Value : 0.84604         
##          Neg Pred Value : 0.66364         
##              Prevalence : 0.83984         
##          Detection Rate : 0.83575         
##    Detection Prevalence : 0.98783         
##       Balanced Accuracy : 0.52277         
##                                           
##        'Positive' Class : 0               
##