Business Problem

The Bank ran a campaign to cross-sell Personal Loans. As part of the pilot campaign, 20,000 customers were contacted through email, SMS, and direct mail with an offer of a Personal Loan at an attractive interest rate of 12%, with the processing fee waived if they responded within one month. 2,512 customers expressed interest and are marked as Target = 1. A model has to be built using a supervised learning technique to find profitable segments to target for cross-selling personal loans.

Data Understanding & Exploratory Data Analysis

A brief analysis of the data set revealed the following:

–Customer Id is mainly for the bank's internal processing and is not relevant for model building or recommendations.
–Random Number will also not be required for the analysis.
–Age Bracket and Age reflect the same information, so only Age has been kept for further analysis.
–The Gender column has three unique values; it is not certain whether the value "Other" is a genuine category or a placeholder for unknown data. For the time being, "Other" has been treated as another gender, which needs to be validated with the Business Team.
–R's CSV reader could not parse the "ACC_OP_DATE" field, so an explicit date conversion with as.Date was required.
–The data is imbalanced: 87% of observations have a target value of 0, while only 13% have a target value of 1.
–The following features have outliers (a quick IQR-based check is sketched after this list): NO_OF_L_CR_TXNS, NO_OF_L_DR_TXNS, TOT_NO_OF_L_TXNS, NO_OF_BR_CSH_WDL_DR_TXNS, NO_OF_ATM_DR_TXNS, NO_OF_NET_DR_TXNS, NO_OF_MOB_DR_TXNS, NO_OF_CHQ_DR_TXNS, AMT_ATM_DR.V1, AMT_CHQ_DR.V1, AMT_NET_DR.V1, AMT_MOB_DR.V1, AMT_L_DR.V1, AMT_OTH_BK_ATM_USG_CHGS, AMT_MIN_BAL_NMC_CHGS, NO_OF_IW_CHQ_BNC_TXNS, NO_OF_OW_CHQ_BNC_TXNS, AVG_AMT_PER_ATM_TXN.V1, AVG_AMT_PER_CSH_WDL_TXN.V1, AVG_AMT_PER_CHQ_TXN.V1, AVG_AMT_PER_NET_TXN.V1, AVG_AMT_PER_MOB_TXN.V1.
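The outlier screening above can be reproduced once bankcampaigndata has been read in (see the code below). This is a minimal sketch assuming the conventional 1.5 × IQR fences; the threshold is an illustrative choice, not the only possible one.

count_outliers <- function(x) {
  qs  <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  iqr <- qs[2] - qs[1]
  sum(x < qs[1] - 1.5 * iqr | x > qs[2] + 1.5 * iqr, na.rm = TRUE)
}
num_cols <- sapply(bankcampaigndata, is.numeric)
sort(sapply(bankcampaigndata[, num_cols], count_outliers), decreasing = TRUE)  # outlier count per numeric column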

# Load packages for analysis; this section lists all required libraries for clarity.
# plyr is loaded before dplyr to avoid the masking problems reported when the order is reversed.
library('plyr')          # data manipulation (must come before dplyr)
library('ggplot2')       # visualization
library('ggthemes')      # visualization themes
library('scales')        # axis and label scales
library('dplyr')         # data manipulation
library('tidyr')         # data tidying
library('purrr')         # functional helpers (keep(), etc.)
library('data.table')    # fast aggregation for the rank-order table
library('glue')          # string interpolation
library('mice')          # imputation
library('randomForest')  # random forest algorithm
library('rpart')         # decision trees
library('rpart.plot')    # decision tree plots
library('rattle')        # fancy tree visualisations
library('tree')          # classification/regression trees
library('maptree')       # tree graphing
library('ROCR')          # ROC curves
library('gains')         # gains and lift tables
library('caTools')       # train/test split utilities
library('caret')         # confusion matrix and model utilities
library('corrr')         # correlation analysis
library('corrplot')      # correlation plots
library('ggcorrplot')    # correlation plots with ggplot2
library('GGally')        # ggcorr()
library('car')           # regression diagnostics
library('gclus')         # scatterplot and cluster utilities
library('cluster')       # clustering
library('visdat')        # missing-data visualisation
library('psych')         # descriptive statistics
library('PerformanceAnalytics')  # correlation charts
library('GPArotation')   # factor rotation
library('MVN')           # multivariate normality tests
library('MASS')          # statistical functions
library('psy')           # psychometrics
library('corpcor')       # covariance estimation
library('fastmatch')     # fast matching
library('RColorBrewer')  # colour palettes
library('dummies')       # converting categorical variables into dummies
library('pscl')          # McFadden R2
library('StatMeasures')  # decile()
library('sqldf')         # SQL queries on data frames
library('leaflet')       # mapping
library('leaflet.extras')
library('geosphere')     # geographic distances
library('gmapsdistance')
bankcampaigndata <- read.csv('bankdatafile.csv')
mydatac <- bankcampaigndata   ## keep an untouched copy for the SQL summaries below
cat("No of observations in the file:", nrow(bankcampaigndata), "\n")
## No of observations in the file: 20000
## ACC_OP_DATE is read in as a factor, so convert it explicitly to Date
bankcampaigndata$ACC_OP_DATE <- as.Date(bankcampaigndata$ACC_OP_DATE, "%m/%d/%y")
str(bankcampaigndata)
## 'data.frame':    20000 obs. of  40 variables:
##  $ CUST_ID                 : Factor w/ 20000 levels "C1","C10","C100",..: 17699 16532 11027 17984 2363 11747 18115 15556 15216 12494 ...
##  $ TARGET                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGE                     : int  27 47 40 53 36 42 30 53 42 30 ...
##  $ GENDER                  : Factor w/ 3 levels "F","M","O": 2 2 2 2 2 1 2 1 1 2 ...
##  $ BALANCE                 : num  3384 287489 18217 71720 1671623 ...
##  $ OCCUPATION              : Factor w/ 4 levels "PROF","SAL","SELF-EMP",..: 3 2 3 2 1 1 1 2 3 1 ...
##  $ AGE_BKT                 : Factor w/ 7 levels "<25",">50","26-30",..: 3 7 5 2 5 6 3 2 6 3 ...
##  $ SCR                     : int  776 324 603 196 167 493 479 562 105 170 ...
##  $ HOLDING_PERIOD          : int  30 28 2 13 24 26 14 25 15 13 ...
##  $ ACC_TYPE                : Factor w/ 2 levels "CA","SA": 2 2 2 1 2 2 2 1 2 2 ...
##  $ ACC_OP_DATE             : Date, format: "2020-03-23" "2020-10-11" ...
##  $ LEN_OF_RLTN_IN_MNTH     : int  146 104 61 107 185 192 177 99 88 111 ...
##  $ NO_OF_L_CR_TXNS         : int  7 8 10 36 20 5 6 14 18 14 ...
##  $ NO_OF_L_DR_TXNS         : int  3 2 5 14 1 2 6 3 14 8 ...
##  $ TOT_NO_OF_L_TXNS        : int  10 10 15 50 21 7 12 17 32 22 ...
##  $ NO_OF_BR_CSH_WDL_DR_TXNS: int  0 0 1 4 1 1 0 3 6 3 ...
##  $ NO_OF_ATM_DR_TXNS       : int  1 1 1 2 0 1 1 0 2 1 ...
##  $ NO_OF_NET_DR_TXNS       : int  2 1 1 3 0 0 1 0 4 0 ...
##  $ NO_OF_MOB_DR_TXNS       : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ NO_OF_CHQ_DR_TXNS       : int  0 0 2 4 0 0 4 0 1 4 ...
##  $ FLG_HAS_CC              : int  0 0 0 0 0 1 0 0 1 0 ...
##  $ AMT_ATM_DR              : int  13100 6600 11200 26100 0 18500 6200 0 35400 18000 ...
##  $ AMT_BR_CSH_WDL_DR       : int  0 0 561120 673590 808480 379310 0 945160 198430 869880 ...
##  $ AMT_CHQ_DR              : int  0 0 49320 60780 0 0 10580 0 51490 32610 ...
##  $ AMT_NET_DR              : num  973557 799813 997570 741506 0 ...
##  $ AMT_MOB_DR              : int  0 0 0 71388 0 0 0 0 170332 0 ...
##  $ AMT_L_DR                : num  986657 806413 1619210 1573364 808480 ...
##  $ FLG_HAS_ANY_CHGS        : int  0 1 1 0 0 0 1 0 0 0 ...
##  $ AMT_OTH_BK_ATM_USG_CHGS : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMT_MIN_BAL_NMC_CHGS    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_IW_CHQ_BNC_TXNS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_OW_CHQ_BNC_TXNS   : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ AVG_AMT_PER_ATM_TXN     : num  13100 6600 11200 13050 0 ...
##  $ AVG_AMT_PER_CSH_WDL_TXN : num  0 0 561120 168398 808480 ...
##  $ AVG_AMT_PER_CHQ_TXN     : num  0 0 24660 15195 0 ...
##  $ AVG_AMT_PER_NET_TXN     : num  486779 799813 997570 247169 0 ...
##  $ AVG_AMT_PER_MOB_TXN     : num  0 0 0 71388 0 ...
##  $ FLG_HAS_NOMINEE         : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ FLG_HAS_OLD_LOAN        : int  1 0 1 0 0 1 1 1 1 0 ...
##  $ random                  : num  1.14e-05 1.11e-04 1.20e-04 1.37e-04 1.74e-04 ...
bankcampaigndata <- within(bankcampaigndata, rm('AGE_BKT'))  ## removing the Age Bracket column as it is redundant (duplicates AGE)

print("Categorisation of Target Data:\n")
## [1] "Categorisation of Target Data:\n"
table(bankcampaigndata$TARGET)  ## Data seems to be biassed here
## 
##     0     1 
## 17488  2512
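As a quick sanity check on the imbalance (a minimal sketch; prop.table is base R):

round(prop.table(table(bankcampaigndata$TARGET)) * 100, 2)  # ~87.44% Target = 0, ~12.56% Target = 1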
bankcampaigndata_analysis <- bankcampaigndata ## taking data into another frame for further manipulation
summary(bankcampaigndata)
##     CUST_ID          TARGET            AGE        GENDER   
##  C1     :    1   Min.   :0.0000   Min.   :21.00   F: 5433  
##  C10    :    1   1st Qu.:0.0000   1st Qu.:30.00   M:14376  
##  C100   :    1   Median :0.0000   Median :38.00   O:  191  
##  C1000  :    1   Mean   :0.1256   Mean   :38.42            
##  C10000 :    1   3rd Qu.:0.0000   3rd Qu.:46.00            
##  C10001 :    1   Max.   :1.0000   Max.   :55.00            
##  (Other):19994                                             
##     BALANCE           OCCUPATION        SCR        HOLDING_PERIOD 
##  Min.   :      0   PROF    :5417   Min.   :100.0   Min.   : 1.00  
##  1st Qu.:  64754   SAL     :5855   1st Qu.:227.0   1st Qu.: 7.00  
##  Median : 231676   SELF-EMP:3568   Median :364.0   Median :15.00  
##  Mean   : 511362   SENP    :5160   Mean   :440.2   Mean   :14.96  
##  3rd Qu.: 653877                   3rd Qu.:644.0   3rd Qu.:22.00  
##  Max.   :8360431                   Max.   :999.0   Max.   :31.00  
##                                                                   
##  ACC_TYPE    ACC_OP_DATE         LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS
##  CA: 4241   Min.   :2019-01-02   Min.   : 29.0       Min.   : 0.00  
##  SA:15759   1st Qu.:2020-03-16   1st Qu.: 79.0       1st Qu.: 6.00  
##             Median :2020-06-20   Median :125.0       Median :10.00  
##             Mean   :2020-06-09   Mean   :125.2       Mean   :12.35  
##             3rd Qu.:2020-09-25   3rd Qu.:172.0       3rd Qu.:14.00  
##             Max.   :2020-12-31   Max.   :221.0       Max.   :75.00  
##                                                                     
##  NO_OF_L_DR_TXNS  TOT_NO_OF_L_TXNS NO_OF_BR_CSH_WDL_DR_TXNS
##  Min.   : 0.000   Min.   :  0.00   Min.   : 0.000          
##  1st Qu.: 2.000   1st Qu.:  9.00   1st Qu.: 1.000          
##  Median : 5.000   Median : 14.00   Median : 1.000          
##  Mean   : 6.634   Mean   : 18.98   Mean   : 1.883          
##  3rd Qu.: 7.000   3rd Qu.: 21.00   3rd Qu.: 2.000          
##  Max.   :74.000   Max.   :149.00   Max.   :15.000          
##                                                            
##  NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS
##  Min.   : 0.000    Min.   : 0.000    Min.   : 0.0000   Min.   : 0.000   
##  1st Qu.: 0.000    1st Qu.: 0.000    1st Qu.: 0.0000   1st Qu.: 0.000   
##  Median : 1.000    Median : 0.000    Median : 0.0000   Median : 2.000   
##  Mean   : 1.029    Mean   : 1.172    Mean   : 0.4118   Mean   : 2.138   
##  3rd Qu.: 1.000    3rd Qu.: 1.000    3rd Qu.: 0.0000   3rd Qu.: 4.000   
##  Max.   :25.000    Max.   :22.000    Max.   :25.0000   Max.   :15.000   
##                                                                         
##    FLG_HAS_CC       AMT_ATM_DR     AMT_BR_CSH_WDL_DR   AMT_CHQ_DR     
##  Min.   :0.0000   Min.   :     0   Min.   :     0    Min.   :      0  
##  1st Qu.:0.0000   1st Qu.:     0   1st Qu.:  2990    1st Qu.:      0  
##  Median :0.0000   Median :  6900   Median :340150    Median :  23840  
##  Mean   :0.3054   Mean   : 10990   Mean   :378475    Mean   : 124520  
##  3rd Qu.:1.0000   3rd Qu.: 15800   3rd Qu.:674675    3rd Qu.:  72470  
##  Max.   :1.0000   Max.   :199300   Max.   :999930    Max.   :4928640  
##                                                                       
##    AMT_NET_DR       AMT_MOB_DR        AMT_L_DR       FLG_HAS_ANY_CHGS
##  Min.   :     0   Min.   :     0   Min.   :      0   Min.   :0.0000  
##  1st Qu.:     0   1st Qu.:     0   1st Qu.: 237936   1st Qu.:0.0000  
##  Median :     0   Median :     0   Median : 695115   Median :0.0000  
##  Mean   :237308   Mean   : 22425   Mean   : 773717   Mean   :0.1106  
##  3rd Qu.:473971   3rd Qu.:     0   3rd Qu.:1078927   3rd Qu.:0.0000  
##  Max.   :999854   Max.   :199667   Max.   :6514921   Max.   :1.0000  
##                                                                      
##  AMT_OTH_BK_ATM_USG_CHGS AMT_MIN_BAL_NMC_CHGS NO_OF_IW_CHQ_BNC_TXNS
##  Min.   :  0.000         Min.   :  0.000      Min.   :0.00000      
##  1st Qu.:  0.000         1st Qu.:  0.000      1st Qu.:0.00000      
##  Median :  0.000         Median :  0.000      Median :0.00000      
##  Mean   :  1.099         Mean   :  1.292      Mean   :0.04275      
##  3rd Qu.:  0.000         3rd Qu.:  0.000      3rd Qu.:0.00000      
##  Max.   :250.000         Max.   :170.000      Max.   :2.00000      
##                                                                    
##  NO_OF_OW_CHQ_BNC_TXNS AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CSH_WDL_TXN
##  Min.   :0.0000        Min.   :    0       Min.   :     0         
##  1st Qu.:0.0000        1st Qu.:    0       1st Qu.:  1266         
##  Median :0.0000        Median : 6000       Median :147095         
##  Mean   :0.0444        Mean   : 7409       Mean   :242237         
##  3rd Qu.:0.0000        3rd Qu.:13500       3rd Qu.:385000         
##  Max.   :2.0000        Max.   :25000       Max.   :999640         
##                                                                   
##  AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_NET_TXN AVG_AMT_PER_MOB_TXN
##  Min.   :     0      Min.   :     0      Min.   :     0     
##  1st Qu.:     0      1st Qu.:     0      1st Qu.:     0     
##  Median :  8645      Median :     0      Median :     0     
##  Mean   : 25093      Mean   :179059      Mean   : 20304     
##  3rd Qu.: 28605      3rd Qu.:257699      3rd Qu.:     0     
##  Max.   :537842      Max.   :999854      Max.   :199667     
##                                                             
##  FLG_HAS_NOMINEE  FLG_HAS_OLD_LOAN     random         
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000114  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.2481866  
##  Median :1.0000   Median :0.0000   Median :0.5061214  
##  Mean   :0.9012   Mean   :0.4929   Mean   :0.5019330  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.7535712  
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.9999471  
## 
### Checking for duplicate customer IDs by running a SQL query on the data frame

banking_duplicate_Indicator <- sqldf('SELECT count(cust_id), CUST_ID FROM bankcampaigndata GROUP BY CUST_ID ORDER BY CUST_ID')
write.csv(banking_duplicate_Indicator, file = "banking_duplicate_Indicator.csv")
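The same duplicate check can be done without SQL; a sketch using base R's duplicated:

sum(duplicated(bankcampaigndata$CUST_ID))  # 0 if every CUST_ID occurs exactly once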

The SQL summaries below show the following:

–The 26-30 age bracket has the highest number of credit transactions, with 31-35 very close behind. The 26-30 bracket also withdraws the largest amounts of cash from ATMs; the other brackets show average cash withdrawals.
–Credit transactions are the most popular transaction mode for every age bracket. Debit transactions are the next most common, and "Mobile Banking Debit Transactions" are the least preferred mode for all age brackets.
–Age brackets "26-30", "31-35" and "41-45" are the groups with the highest transaction counts.
–The "<25" bracket has almost the same average cash withdrawal and average amount debited per cheque transaction.
–"Avg. Amt withdrawn per Cash Withdrawal Transaction" is the highest, followed by "Avg. Amt debited per Net Transaction".
–"Avg. Amt debited per Mobile Banking Transaction" is low for all age brackets, mainly because of the low number of "Mobile Banking Debit Transactions".
–Transaction counts and average cash withdrawals across the different modes do not depend on whether the customer has an old loan.
–Unexpectedly, customers without a credit card made a much higher number of credit transactions; it is unclear how customers who do not have a credit card made 173423 credit transactions. This needs to be validated with the Business Team and may be a data issue.
–Average withdrawals through the different modes (cheque, cash and net) are, surprisingly, much higher for customers who do not hold a credit card.
–Cash is the preferred mode across all age brackets; net transactions are second, and mobile transactions are picking up, becoming about as popular as cheques.
–The ">50" bracket uses mobile banking the most of all age brackets.

qqmysummary <- sqldf('SELECT AGE_BKT,
    sum(NO_OF_L_CR_TXNS)         "No. of Credit Transactions",
    sum(NO_OF_L_DR_TXNS)         "No. of Debit Transactions",
    sum(NO_OF_ATM_DR_TXNS)       "No. of ATM Debit Transactions",
    sum(NO_OF_NET_DR_TXNS)       "No. of Net Debit Transactions",
    sum(NO_OF_MOB_DR_TXNS)       "No. of Mobile Banking Debit Transactions",
    sum(NO_OF_CHQ_DR_TXNS)       "No. of Cheque Debit Transactions",
    avg(AVG_AMT_PER_ATM_TXN)     "Avg. Amt withdrawn per ATM Transaction",
    avg(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",
    avg(AVG_AMT_PER_CHQ_TXN)     "Avg. Amt debited per Cheque Transaction",
    avg(AVG_AMT_PER_NET_TXN)     "Avg. Amt debited per Net Transaction",
    avg(AVG_AMT_PER_MOB_TXN)     "Avg. Amt debited per Mobile Banking Transaction"
  FROM mydatac GROUP BY AGE_BKT ORDER BY AGE_BKT')
write.csv(qqmysummary, file = "qqsummary.csv")



# sqldf('SELECT distinct OCCUPATION FROM bankcampaigndata ')
## per-age-bracket totals; note that the "Avg. Amt" columns are summed here,
## unlike qqmysummary above, which averages them
mysummary <- sqldf('SELECT AGE_BKT,
    sum(NO_OF_L_CR_TXNS)         "No. of Credit Transactions",
    sum(NO_OF_L_DR_TXNS)         "No. of Debit Transactions",
    sum(NO_OF_ATM_DR_TXNS)       "No. of ATM Debit Transactions",
    sum(NO_OF_NET_DR_TXNS)       "No. of Net Debit Transactions",
    sum(NO_OF_MOB_DR_TXNS)       "No. of Mobile Banking Debit Transactions",
    sum(NO_OF_CHQ_DR_TXNS)       "No. of Cheque Debit Transactions",
    sum(AVG_AMT_PER_ATM_TXN)     "Avg. Amt withdrawn per ATM Transaction",
    sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",
    sum(AVG_AMT_PER_CHQ_TXN)     "Avg. Amt debited per Cheque Transaction",
    sum(AVG_AMT_PER_NET_TXN)     "Avg. Amt debited per Net Transaction",
    sum(AVG_AMT_PER_MOB_TXN)     "Avg. Amt debited per Mobile Banking Transaction"
  FROM mydatac GROUP BY AGE_BKT ORDER BY AGE_BKT')
write.csv(mysummary, file = "summary.csv")

## the same totals, broken out by whether the customer has an old loan
mynewsummary <- sqldf('SELECT FLG_HAS_OLD_LOAN,
    sum(NO_OF_L_CR_TXNS)         "No. of Credit Transactions",
    sum(NO_OF_L_DR_TXNS)         "No. of Debit Transactions",
    sum(NO_OF_ATM_DR_TXNS)       "No. of ATM Debit Transactions",
    sum(NO_OF_NET_DR_TXNS)       "No. of Net Debit Transactions",
    sum(NO_OF_MOB_DR_TXNS)       "No. of Mobile Banking Debit Transactions",
    sum(NO_OF_CHQ_DR_TXNS)       "No. of Cheque Debit Transactions",
    sum(AVG_AMT_PER_ATM_TXN)     "Avg. Amt withdrawn per ATM Transaction",
    sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",
    sum(AVG_AMT_PER_CHQ_TXN)     "Avg. Amt debited per Cheque Transaction",
    sum(AVG_AMT_PER_NET_TXN)     "Avg. Amt debited per Net Transaction",
    sum(AVG_AMT_PER_MOB_TXN)     "Avg. Amt debited per Mobile Banking Transaction"
  FROM mydatac GROUP BY FLG_HAS_OLD_LOAN')
write.csv(mynewsummary, file = "newsummary.csv")


## the same totals, broken out by credit card ownership
mynewccsummary <- sqldf('SELECT FLG_HAS_CC,
    sum(NO_OF_L_CR_TXNS)         "No. of Credit Transactions",
    sum(NO_OF_L_DR_TXNS)         "No. of Debit Transactions",
    sum(NO_OF_ATM_DR_TXNS)       "No. of ATM Debit Transactions",
    sum(NO_OF_NET_DR_TXNS)       "No. of Net Debit Transactions",
    sum(NO_OF_MOB_DR_TXNS)       "No. of Mobile Banking Debit Transactions",
    sum(NO_OF_CHQ_DR_TXNS)       "No. of Cheque Debit Transactions",
    sum(AVG_AMT_PER_ATM_TXN)     "Avg. Amt withdrawn per ATM Transaction",
    sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",
    sum(AVG_AMT_PER_CHQ_TXN)     "Avg. Amt debited per Cheque Transaction",
    sum(AVG_AMT_PER_NET_TXN)     "Avg. Amt debited per Net Transaction",
    sum(AVG_AMT_PER_MOB_TXN)     "Avg. Amt debited per Mobile Banking Transaction"
  FROM mydatac GROUP BY FLG_HAS_CC')
write.csv(mynewccsummary, file = "flagccnewsummary.csv")

Quick code to draw a histogram of every numeric variable in the data. Observations from the histograms:

–Customers around age 30 registered the maximum number of transactions in the shared dataset.
–Most accounts carry a very minimal balance.
–An SCR of around 250 has the maximum frequency.
–A holding period of 15 has the maximum frequency.
–5-6 credit transactions and 5-6 debit transactions are the most frequent counts.
–A total of 10-15 transactions has the maximum frequency, followed closely by 15-20.

bankcampaigndata %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## density diagram for all the variables
bankcampaigndata %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_density()                         # as density

## bar chart for all the variables
bankcampaigndata %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(value)) +                     # Plot the values
    facet_wrap(~ key, scales = "free") +   # In separate panels
    geom_bar() 

## correlation overview across all numeric variables (non-numeric columns are ignored)

mycor <- ggcorr(bankcampaigndata)
## Warning in ggcorr(bankcampaigndata): data in column(s) 'CUST_ID', 'GENDER',
## 'OCCUPATION', 'ACC_TYPE', 'ACC_OP_DATE' are not numeric and were ignored
mycor

### Data Preparation & Hypothesis Statement Validation

Scaling the variables here. The data values are on very different scales, so the following features have been chosen for scaling: BALANCE, AMT_ATM_DR, AVG_AMT_PER_MOB_TXN, AVG_AMT_PER_NET_TXN, AVG_AMT_PER_CHQ_TXN, AVG_AMT_PER_CSH_WDL_TXN, AVG_AMT_PER_ATM_TXN, AMT_L_DR, AMT_MOB_DR, AMT_NET_DR, AMT_CHQ_DR, AMT_BR_CSH_WDL_DR.

## z-score standardise the monetary columns; a loop avoids the repetitive copy-paste block
cols_to_scale <- c("BALANCE", "AMT_ATM_DR", "AVG_AMT_PER_MOB_TXN", "AVG_AMT_PER_NET_TXN",
                   "AVG_AMT_PER_CHQ_TXN", "AVG_AMT_PER_CSH_WDL_TXN", "AVG_AMT_PER_ATM_TXN",
                   "AMT_L_DR", "AMT_MOB_DR", "AMT_NET_DR", "AMT_CHQ_DR", "AMT_BR_CSH_WDL_DR")
for (col in cols_to_scale) {
  bankcampaigndata[[col]] <- scale(bankcampaigndata[[col]])   # z-score: (x - mean) / sd
}
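As a sanity check (a minimal sketch in base R), every scaled column should now have mean ~0 and standard deviation 1:

sapply(bankcampaigndata[cols_to_scale],
       function(x) c(mean = round(mean(x), 3), sd = round(sd(x), 3)))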

Convert the factor variables into dummy variables. CUST_ID (column 1) is dropped, as noted in the EDA, and the original GENDER, OCCUPATION and ACC_TYPE columns no longer appear because dummy.data.frame replaces each factor column with its indicator columns.

bankcampaigndata_analysis <- dummy.data.frame(bankcampaigndata[,-1], sep = ".")
str(bankcampaigndata_analysis)
## 'data.frame':    20000 obs. of  44 variables:
##  $ TARGET                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGE                     : int  27 47 40 53 36 42 30 53 42 30 ...
##  $ GENDER.F                : int  0 0 0 0 0 1 0 1 1 0 ...
##  $ GENDER.M                : int  1 1 1 1 1 0 1 0 0 1 ...
##  $ GENDER.O                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BALANCE                 : num [1:20000, 1] -0.702 -0.309 -0.682 -0.608 1.603 ...
##   ..- attr(*, "scaled:center")= num 511362
##   ..- attr(*, "scaled:scale")= num 723593
##  $ OCCUPATION.PROF         : int  0 0 0 0 1 1 1 0 0 1 ...
##  $ OCCUPATION.SAL          : int  0 1 0 1 0 0 0 1 0 0 ...
##  $ OCCUPATION.SELF-EMP     : int  1 0 1 0 0 0 0 0 1 0 ...
##  $ OCCUPATION.SENP         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SCR                     : int  776 324 603 196 167 493 479 562 105 170 ...
##  $ HOLDING_PERIOD          : int  30 28 2 13 24 26 14 25 15 13 ...
##  $ ACC_TYPE.CA             : int  0 0 0 1 0 0 0 1 0 0 ...
##  $ ACC_TYPE.SA             : int  1 1 1 0 1 1 1 0 1 1 ...
##  $ ACC_OP_DATE             : Date, format: "2020-03-23" "2020-10-11" ...
##  $ LEN_OF_RLTN_IN_MNTH     : int  146 104 61 107 185 192 177 99 88 111 ...
##  $ NO_OF_L_CR_TXNS         : int  7 8 10 36 20 5 6 14 18 14 ...
##  $ NO_OF_L_DR_TXNS         : int  3 2 5 14 1 2 6 3 14 8 ...
##  $ TOT_NO_OF_L_TXNS        : int  10 10 15 50 21 7 12 17 32 22 ...
##  $ NO_OF_BR_CSH_WDL_DR_TXNS: int  0 0 1 4 1 1 0 3 6 3 ...
##  $ NO_OF_ATM_DR_TXNS       : int  1 1 1 2 0 1 1 0 2 1 ...
##  $ NO_OF_NET_DR_TXNS       : int  2 1 1 3 0 0 1 0 4 0 ...
##  $ NO_OF_MOB_DR_TXNS       : int  0 0 0 1 0 0 0 0 1 0 ...
##  $ NO_OF_CHQ_DR_TXNS       : int  0 0 2 4 0 0 4 0 1 4 ...
##  $ FLG_HAS_CC              : int  0 0 0 0 0 1 0 0 1 0 ...
##  $ AMT_ATM_DR              : num [1:20000, 1] 0.1379 -0.2868 0.0137 0.9873 -0.7181 ...
##   ..- attr(*, "scaled:center")= num 3.78e-17
##   ..- attr(*, "scaled:scale")= num 1
##  $ AMT_BR_CSH_WDL_DR       : num [1:20000, 1] -1.142 -1.142 0.551 0.89 1.297 ...
##   ..- attr(*, "scaled:center")= num 378474
##   ..- attr(*, "scaled:scale")= num 331418
##  $ AMT_CHQ_DR              : num [1:20000, 1] -0.283 -0.283 -0.171 -0.145 -0.283 ...
##   ..- attr(*, "scaled:center")= num 124520
##   ..- attr(*, "scaled:scale")= num 439380
##  $ AMT_NET_DR              : num [1:20000, 1] 2.308 1.763 2.383 1.58 -0.744 ...
##   ..- attr(*, "scaled:center")= num 237308
##   ..- attr(*, "scaled:scale")= num 319068
##  $ AMT_MOB_DR              : num [1:20000, 1] -0.453 -0.453 -0.453 0.989 -0.453 ...
##   ..- attr(*, "scaled:center")= num 22425
##   ..- attr(*, "scaled:scale")= num 49485
##  $ AMT_L_DR                : num [1:20000, 1] 0.2988 0.0459 1.1864 1.1221 0.0488 ...
##   ..- attr(*, "scaled:center")= num 773717
##   ..- attr(*, "scaled:scale")= num 712659
##  $ FLG_HAS_ANY_CHGS        : int  0 1 1 0 0 0 1 0 0 0 ...
##  $ AMT_OTH_BK_ATM_USG_CHGS : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMT_MIN_BAL_NMC_CHGS    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_IW_CHQ_BNC_TXNS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_OW_CHQ_BNC_TXNS   : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ AVG_AMT_PER_ATM_TXN     : num [1:20000, 1] 0.789 -0.112 0.526 0.782 -1.028 ...
##   ..- attr(*, "scaled:center")= num 7409
##   ..- attr(*, "scaled:scale")= num 7210
##  $ AVG_AMT_PER_CSH_WDL_TXN : num [1:20000, 1] -0.898 -0.898 1.182 -0.274 2.098 ...
##   ..- attr(*, "scaled:center")= num 242236
##   ..- attr(*, "scaled:scale")= num 269838
##  $ AVG_AMT_PER_CHQ_TXN     : num [1:20000, 1] -0.50808 -0.50808 -0.00876 -0.20041 -0.50808 ...
##   ..- attr(*, "scaled:center")= num 25092
##   ..- attr(*, "scaled:scale")= num 49387
##  $ AVG_AMT_PER_NET_TXN     : num [1:20000, 1] 1.092 2.204 2.906 0.242 -0.636 ...
##   ..- attr(*, "scaled:center")= num 179059
##   ..- attr(*, "scaled:scale")= num 281707
##  $ AVG_AMT_PER_MOB_TXN     : num [1:20000, 1] -0.437 -0.437 -0.437 1.099 -0.437 ...
##   ..- attr(*, "scaled:center")= num 20304
##   ..- attr(*, "scaled:scale")= num 46473
##  $ FLG_HAS_NOMINEE         : int  1 1 1 1 1 1 0 1 1 0 ...
##  $ FLG_HAS_OLD_LOAN        : int  1 0 1 0 0 1 1 1 1 0 ...
##  $ random                  : num  1.14e-05 1.11e-04 1.20e-04 1.37e-04 1.74e-04 ...
##  - attr(*, "dummies")=List of 3
##   ..$ GENDER    : int  3 4 5
##   ..$ OCCUPATION: int  7 8 9 10
##   ..$ ACC_TYPE  : int  13 14
### renaming the dummy columns to more convenient names (no special characters)
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.PROF"] <- "OCCUPROF"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SAL"] <- "OCCUSAL"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SELF-EMP"] <- "OCCUSELFEMP"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SENP"] <- "OCCUSENP"
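An equivalent encoding can be produced with caret's dummyVars, which we already have loaded; a hedged sketch (the generated column names may differ slightly from those of dummy.data.frame):

dv <- dummyVars(~ GENDER + OCCUPATION + ACC_TYPE, data = bankcampaigndata)
head(predict(dv, newdata = bankcampaigndata))  # one indicator column per factor level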

### Split the data into training and test datasets; we use a 75%/25% split
# set.seed(42)  # a fixed seed would make this split reproducible
MyIndex  <- sample(1:nrow(bankcampaigndata_analysis), size = 0.25 * nrow(bankcampaigndata_analysis))

bankcampaigndata_analysis_test <- bankcampaigndata_analysis[MyIndex,]
nrow(bankcampaigndata_analysis_test)
## [1] 5000
bankcampaigndata_analysis_training <- bankcampaigndata_analysis[-MyIndex,]
nrow(bankcampaigndata_analysis_training)
## [1] 15000
str(bankcampaigndata_analysis_training)
## 'data.frame':    15000 obs. of  44 variables:
##  $ TARGET                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AGE                     : int  27 40 53 36 42 30 53 42 30 43 ...
##  $ GENDER.F                : int  0 0 0 0 1 0 1 1 0 0 ...
##  $ GENDER.M                : int  1 1 1 1 0 1 0 0 1 1 ...
##  $ GENDER.O                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ BALANCE                 : num [1:15000, 1] -0.702 -0.6815 -0.6076 1.6035 0.0143 ...
##  $ OCCUPROF                : int  0 0 0 1 1 1 0 0 1 1 ...
##  $ OCCUSAL                 : int  0 0 1 0 0 0 1 0 0 0 ...
##  $ OCCUSELFEMP             : int  1 1 0 0 0 0 0 1 0 0 ...
##  $ OCCUSENP                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SCR                     : int  776 603 196 167 493 479 562 105 170 164 ...
##  $ HOLDING_PERIOD          : int  30 2 13 24 26 14 25 15 13 24 ...
##  $ ACC_TYPE.CA             : int  0 0 1 0 0 0 1 0 0 1 ...
##  $ ACC_TYPE.SA             : int  1 1 0 1 1 1 0 1 1 0 ...
##  $ ACC_OP_DATE             : Date, format: "2020-03-23" "2020-04-26" ...
##  $ LEN_OF_RLTN_IN_MNTH     : int  146 61 107 185 192 177 99 88 111 183 ...
##  $ NO_OF_L_CR_TXNS         : int  7 10 36 20 5 6 14 18 14 4 ...
##  $ NO_OF_L_DR_TXNS         : int  3 5 14 1 2 6 3 14 8 4 ...
##  $ TOT_NO_OF_L_TXNS        : int  10 15 50 21 7 12 17 32 22 8 ...
##  $ NO_OF_BR_CSH_WDL_DR_TXNS: int  0 1 4 1 1 0 3 6 3 2 ...
##  $ NO_OF_ATM_DR_TXNS       : int  1 1 2 0 1 1 0 2 1 1 ...
##  $ NO_OF_NET_DR_TXNS       : int  2 1 3 0 0 1 0 4 0 0 ...
##  $ NO_OF_MOB_DR_TXNS       : int  0 0 1 0 0 0 0 1 0 1 ...
##  $ NO_OF_CHQ_DR_TXNS       : int  0 2 4 0 0 4 0 1 4 0 ...
##  $ FLG_HAS_CC              : int  0 0 0 0 1 0 0 1 0 1 ...
##  $ AMT_ATM_DR              : num [1:15000, 1] 0.1379 0.0137 0.9873 -0.7181 0.4907 ...
##  $ AMT_BR_CSH_WDL_DR       : num [1:15000, 1] -1.14199 0.5511 0.89046 1.29747 0.00252 ...
##  $ AMT_CHQ_DR              : num [1:15000, 1] -0.283 -0.171 -0.145 -0.283 -0.283 ...
##  $ AMT_NET_DR              : num [1:15000, 1] 2.308 2.383 1.58 -0.744 -0.744 ...
##  $ AMT_MOB_DR              : num [1:15000, 1] -0.453 -0.453 0.989 -0.453 -0.453 ...
##  $ AMT_L_DR                : num [1:15000, 1] 0.2988 1.1864 1.1221 0.0488 -0.5275 ...
##  $ FLG_HAS_ANY_CHGS        : int  0 1 0 0 0 1 0 0 0 0 ...
##  $ AMT_OTH_BK_ATM_USG_CHGS : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMT_MIN_BAL_NMC_CHGS    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_IW_CHQ_BNC_TXNS   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NO_OF_OW_CHQ_BNC_TXNS   : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ AVG_AMT_PER_ATM_TXN     : num [1:15000, 1] 0.789 0.526 0.782 -1.028 1.538 ...
##  $ AVG_AMT_PER_CSH_WDL_TXN : num [1:15000, 1] -0.898 1.182 -0.274 2.098 0.508 ...
##  $ AVG_AMT_PER_CHQ_TXN     : num [1:15000, 1] -0.50808 -0.00876 -0.20041 -0.50808 -0.50808 ...
##  $ AVG_AMT_PER_NET_TXN     : num [1:15000, 1] 1.092 2.906 0.242 -0.636 -0.636 ...
##  $ AVG_AMT_PER_MOB_TXN     : num [1:15000, 1] -0.437 -0.437 1.099 -0.437 -0.437 ...
##  $ FLG_HAS_NOMINEE         : int  1 1 1 1 1 0 1 1 0 1 ...
##  $ FLG_HAS_OLD_LOAN        : int  1 1 0 0 1 1 1 1 0 0 ...
##  $ random                  : num  1.14e-05 1.20e-04 1.37e-04 1.74e-04 4.06e-04 ...
##  - attr(*, "dummies")=List of 3
##   ..$ GENDER    : int  3 4 5
##   ..$ OCCUPATION: int  7 8 9 10
##   ..$ ACC_TYPE  : int  13 14
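Because the classes are imbalanced, a stratified split that preserves the 87/13 TARGET ratio in both partitions is worth considering. A sketch using caTools::sample.split (already loaded above) with the same 75/25 ratio:

## stratified alternative: sample.split preserves the TARGET ratio in both partitions
split_flag  <- sample.split(bankcampaigndata_analysis$TARGET, SplitRatio = 0.75)
train_strat <- bankcampaigndata_analysis[split_flag, ]   # ~15000 rows, ~12.6% responders
test_strat  <- bankcampaigndata_analysis[!split_flag, ]  # ~5000 rows, ~12.6% responders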

Running ANOVA for our hypothesis. A statistical hypothesis is an assumption about population parameters, which may or may not be true; hypothesis testing refers to the formal procedures statisticians use to accept or reject such hypotheses.

Null hypothesis: there is no relationship between an input variable and the target response. Alternative hypothesis: there is a relationship between the input variable and the target response.

Our acceptance criterion is based on the p-value at a 95% confidence level, so input variables with a p-value below 0.05 are accepted as having a statistically significant impact on the response variable. The objective of the ANOVA test is to analyse whether there is a statistically significant difference in Target across the input variables.

The ANOVA has been run on the training sample with all input variables. The following input variables have p-values of 0.05 or more and hence do not show a significant impact on the response variable Target:

–AMT_CHQ_DR
–AMT_OTH_BK_ATM_USG_CHGS
–NO_OF_IW_CHQ_BNC_TXNS
–NO_OF_OW_CHQ_BNC_TXNS
–AVG_AMT_PER_CHQ_TXN

bankcampaigndata.aov <- aov(TARGET ~., bankcampaigndata_analysis_training)
summary(bankcampaigndata.aov)
##                             Df Sum Sq Mean Sq F value   Pr(>F)    
## AGE                          1    1.3    1.29  13.020 0.000309 ***
## GENDER.F                     1    3.5    3.50  35.284 2.91e-09 ***
## GENDER.M                     1    0.4    0.36   3.583 0.058384 .  
## BALANCE                      1    6.4    6.36  64.016 1.32e-15 ***
## OCCUPROF                     1    0.2    0.17   1.678 0.195164    
## OCCUSAL                      1    9.3    9.27  93.318  < 2e-16 ***
## OCCUSELFEMP                  1   13.0   13.01 131.008  < 2e-16 ***
## SCR                          1    8.5    8.54  86.046  < 2e-16 ***
## HOLDING_PERIOD               1   30.0   30.04 302.490  < 2e-16 ***
## ACC_TYPE.CA                  1    6.8    6.83  68.794  < 2e-16 ***
## ACC_OP_DATE                  1    0.1    0.15   1.463 0.226456    
## LEN_OF_RLTN_IN_MNTH          1    0.3    0.29   2.888 0.089259 .  
## NO_OF_L_CR_TXNS              1   11.8   11.76 118.479  < 2e-16 ***
## NO_OF_L_DR_TXNS              1    0.2    0.23   2.286 0.130528    
## TOT_NO_OF_L_TXNS             1   47.0   46.97 473.048  < 2e-16 ***
## NO_OF_BR_CSH_WDL_DR_TXNS     1    1.9    1.89  19.004 1.31e-05 ***
## NO_OF_ATM_DR_TXNS            1    0.2    0.18   1.800 0.179784    
## NO_OF_NET_DR_TXNS            1    0.2    0.20   2.043 0.152917    
## NO_OF_MOB_DR_TXNS            1    0.7    0.72   7.208 0.007264 ** 
## FLG_HAS_CC                   1   14.1   14.08 141.781  < 2e-16 ***
## AMT_ATM_DR                   1    0.4    0.43   4.370 0.036602 *  
## AMT_BR_CSH_WDL_DR            1    0.7    0.68   6.833 0.008960 ** 
## AMT_CHQ_DR                   1    0.1    0.07   0.751 0.386280    
## AMT_NET_DR                   1    0.4    0.39   3.947 0.046971 *  
## AMT_MOB_DR                   1    0.1    0.09   0.892 0.344953    
## FLG_HAS_ANY_CHGS             1    1.1    1.13  11.346 0.000758 ***
## AMT_OTH_BK_ATM_USG_CHGS      1    0.0    0.01   0.099 0.753266    
## AMT_MIN_BAL_NMC_CHGS         1    0.3    0.27   2.673 0.102061    
## NO_OF_IW_CHQ_BNC_TXNS        1    0.0    0.00   0.003 0.954476    
## NO_OF_OW_CHQ_BNC_TXNS        1    0.0    0.00   0.000 0.990425    
## AVG_AMT_PER_ATM_TXN          1    0.7    0.72   7.239 0.007140 ** 
## AVG_AMT_PER_CSH_WDL_TXN      1    0.1    0.11   1.136 0.286592    
## AVG_AMT_PER_CHQ_TXN          1    0.2    0.24   2.382 0.122788    
## AVG_AMT_PER_NET_TXN          1    0.3    0.28   2.802 0.094147 .  
## AVG_AMT_PER_MOB_TXN          1    0.1    0.09   0.905 0.341586    
## FLG_HAS_NOMINEE              1    0.4    0.41   4.156 0.041502 *  
## FLG_HAS_OLD_LOAN             1    0.3    0.32   3.226 0.072510 .  
## random                       1    0.0    0.02   0.237 0.626485    
## Residuals                14961 1485.5    0.10                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
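The non-significant terms can also be extracted programmatically instead of being read off the table; a minimal sketch, assuming the single-stratum table that summary.aov returns here:

aov_table <- summary(bankcampaigndata.aov)[[1]]
pvals     <- aov_table[["Pr(>F)"]]
trimws(rownames(aov_table))[!is.na(pvals) & pvals >= 0.05]  # terms with p-value >= 0.05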

Modelling

A first dry run of the random forest is made with the following parameters, and the random forest object is plotted against its error. ***The optimal number of trees read from the plot is 40, the point where the error stabilises.

ntree: number of trees to grow; we use 80. mtry: number of variables tried at each split; by default mtry = floor(sqrt(number of independent variables)) for a classification model, and we start with 3 here. nodesize: minimum size of terminal nodes; we use 7.

mybankRF <- randomForest(TARGET ~., data= bankcampaigndata_analysis_training, ntree=80, mtry=3, nodesize=7,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(mybankRF)
## 
## Call:
##  randomForest(formula = TARGET ~ ., data = bankcampaigndata_analysis_training,      ntree = 80, mtry = 3, nodesize = 7, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 80
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.04645676
##                     % Var explained: 57.68
plot(mybankRF, main="Random Forest Tree Optimal Diagram", col = "dark red")
legend("topright", c("OOB","0","1"), text.col = 1:6, lty = 1:3, col = 1:3)  ## the error stabilises at around 40 trees

Important variable detection is done here: the extractor function for the variable importance measures produced by randomForest, sorted by node purity. ****Note: all variables with a node purity value above 20 are treated as key variables.

Key variables for the model: BALANCE, SCR (Generic Marketing Score), AMT_L_DR (Total Amount Debited), ACC_OP_DATE (Account Opening Date), HOLDING_PERIOD (Ability to hold money in the account), LEN_OF_RLTN_IN_MNTH (Length of Relationship in Months), TOT_NO_OF_L_TXNS (Total No. of Transactions), NO_OF_L_CR_TXNS (No. of Credit Transactions), AGE, AMT_BR_CSH_WDL_DR (Amount of cash withdrawn from Branch), AVG_AMT_PER_CSH_WDL_TXN (Avg. Amt withdrawn per Cash Withdrawal Transaction), AMT_ATM_DR (Amount withdrawn from ATM), AVG_AMT_PER_ATM_TXN (Avg. Amt withdrawn per ATM Transaction), AVG_AMT_PER_CHQ_TXN (Avg. Amt debited per Cheque Transaction), AMT_CHQ_DR (Amount debited by Cheque Transactions), AMT_NET_DR (Amount debited by Net Transactions), AVG_AMT_PER_NET_TXN (Avg. Amt debited per Net Transaction), NO_OF_L_DR_TXNS (No. of Debit Transactions), NO_OF_BR_CSH_WDL_DR_TXNS (No. of Branch Cash Withdrawal Transactions), NO_OF_CHQ_DR_TXNS (No. of Cheque Debit Transactions), AMT_MOB_DR (Amount debited by Mobile Banking Transactions), AVG_AMT_PER_MOB_TXN (Avg. Amt debited per Mobile Banking Transaction), OCCUSELFEMP (self-employed occupation dummy).

This measure is the total decrease in node impurities from splitting on the variable, averaged over all trees. For classification, the node impurity is measured by the Gini index. For regression, it is measured by residual sum of squares.

impvar <- round(randomForest::importance(mybankRF),2)
impvar[order(impvar[,2],decreasing = TRUE),]  ### SCR and BALANCE top the node-purity ranking
##                          %IncMSE IncNodePurity
## SCR                        15.75         76.82
## BALANCE                    20.66         75.85
## ACC_OP_DATE                18.55         69.24
## AMT_L_DR                   15.57         65.67
## LEN_OF_RLTN_IN_MNTH        20.08         65.44
## HOLDING_PERIOD             15.49         64.16
## AGE                        20.82         58.07
## TOT_NO_OF_L_TXNS           17.24         57.92
## NO_OF_L_CR_TXNS            15.80         56.74
## AMT_BR_CSH_WDL_DR          15.23         56.03
## AVG_AMT_PER_CSH_WDL_TXN    16.36         54.44
## AVG_AMT_PER_ATM_TXN        13.23         54.33
## AMT_ATM_DR                  9.81         52.67
## AMT_CHQ_DR                 13.53         50.13
## AVG_AMT_PER_CHQ_TXN        18.68         47.78
## AMT_NET_DR                  8.34         40.76
## random                     -0.63         39.56
## AVG_AMT_PER_NET_TXN         6.96         39.37
## NO_OF_L_DR_TXNS             9.77         35.98
## NO_OF_BR_CSH_WDL_DR_TXNS   11.93         28.79
## NO_OF_CHQ_DR_TXNS          10.15         25.93
## AMT_MOB_DR                  8.42         23.07
## AVG_AMT_PER_MOB_TXN         9.26         22.21
## OCCUSELFEMP                13.72         21.48
## FLG_HAS_CC                 14.91         18.90
## NO_OF_ATM_DR_TXNS           7.54         17.30
## NO_OF_NET_DR_TXNS           8.51         15.30
## OCCUSAL                    10.77         12.97
## FLG_HAS_OLD_LOAN           12.79         11.67
## OCCUSENP                    9.60          9.50
## GENDER.F                    6.28          9.22
## GENDER.M                    6.49          9.12
## FLG_HAS_ANY_CHGS           11.35          8.79
## OCCUPROF                    8.75          8.72
## ACC_TYPE.SA                 8.70          7.56
## ACC_TYPE.CA                 7.19          7.24
## FLG_HAS_NOMINEE            12.04          6.50
## NO_OF_MOB_DR_TXNS           5.67          6.33
## NO_OF_OW_CHQ_BNC_TXNS       8.39          5.08
## NO_OF_IW_CHQ_BNC_TXNS       8.07          4.87
## GENDER.O                    5.40          2.52
## AMT_MIN_BAL_NMC_CHGS        4.65          1.19
## AMT_OTH_BK_ATM_USG_CHGS     3.34          0.93
varImpPlot(mybankRF)  ## plotting the variable importance measures

Finding the optimal mtry with the tuneRF function. ***The optimal mtry from the following plot is 13, where the OOB error is minimised before it starts increasing again.

mybankTuneRF <- tuneRF(x=bankcampaigndata_analysis_training[,-1],  ### the optimised mtry comes out as 13 (lowest OOB error below)
                       y=bankcampaigndata_analysis_training$TARGET,
                       mtryStart= 3,
                       ntreeTry=20,
                       stepFactor=1.5,
                       improve=0.0001,
                       trace=TRUE,
                       plot = TRUE,
                       doBest = TRUE,
                       nodesize=10,
                       importance=TRUE
)
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 3  OOB error = 0.05601554 
## Searching left ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 2     OOB error = 0.06335749 
## -0.1310699 1e-04 
## Searching right ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 4     OOB error = 0.05381888 
## 0.03921525 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 6     OOB error = 0.05187122 
## 0.03618905 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 9     OOB error = 0.05032635 
## 0.02978296 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 13    OOB error = 0.04851986 
## 0.03589536 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 19    OOB error = 0.04926101 
## -0.0152751 1e-04
## Warning in randomForest.default(x, y, mtry = res[which.min(res[, 2]), 1], :
## The response has five or fewer unique values. Are you sure you want to do
## regression?

Now that we have the important variables, the optimal number of trees and the optimal mtry, a new random forest is fitted with these revised values.

mybankVIFinalRF <- randomForest(TARGET ~ BALANCE+ SCR+AMT_L_DR+ACC_OP_DATE+HOLDING_PERIOD+LEN_OF_RLTN_IN_MNTH+TOT_NO_OF_L_TXNS+NO_OF_L_CR_TXNS+AGE+AMT_BR_CSH_WDL_DR+AVG_AMT_PER_CSH_WDL_TXN+AMT_ATM_DR+AVG_AMT_PER_ATM_TXN+AVG_AMT_PER_CHQ_TXN+AMT_CHQ_DR+AMT_NET_DR+AVG_AMT_PER_NET_TXN+NO_OF_L_DR_TXNS+NO_OF_BR_CSH_WDL_DR_TXNS+NO_OF_CHQ_DR_TXNS+AMT_MOB_DR+AVG_AMT_PER_MOB_TXN+OCCUSELFEMP , data= bankcampaigndata_analysis_training, ntree=40, mtry=13, nodesize=7,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(mybankVIFinalRF)
## 
## Call:
##  randomForest(formula = TARGET ~ BALANCE + SCR + AMT_L_DR + ACC_OP_DATE +      HOLDING_PERIOD + LEN_OF_RLTN_IN_MNTH + TOT_NO_OF_L_TXNS +      NO_OF_L_CR_TXNS + AGE + AMT_BR_CSH_WDL_DR + AVG_AMT_PER_CSH_WDL_TXN +      AMT_ATM_DR + AVG_AMT_PER_ATM_TXN + AVG_AMT_PER_CHQ_TXN +      AMT_CHQ_DR + AMT_NET_DR + AVG_AMT_PER_NET_TXN + NO_OF_L_DR_TXNS +      NO_OF_BR_CSH_WDL_DR_TXNS + NO_OF_CHQ_DR_TXNS + AMT_MOB_DR +      AVG_AMT_PER_MOB_TXN + OCCUSELFEMP, data = bankcampaigndata_analysis_training,      ntree = 40, mtry = 13, nodesize = 7, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 40
## No. of variables tried at each split: 13
## 
##           Mean of squared residuals: 0.04090712
##                     % Var explained: 62.74
plot(mybankVIFinalRF, main="Variable Importance Random Forest Final Optimal Diagram")

print("Tree Sizes are")
## [1] "Tree Sizes are"
treesize(mybankVIFinalRF)
##  [1] 2652 2496 2479 2485 2552 2423 2593 2550 2491 2376 2524 2596 2405 2333
## [15] 2576 2379 2576 2361 2534 2324 2555 2624 2391 2657 2423 2591 2282 2524
## [29] 2471 2346 2432 2465 2529 2612 2410 2594 2613 2511 2421 2626

Model Performance Evaluation

The Confusion Matrix is one aspect of model performance measurement. Measurement on the training dataset: the input data is biased towards Target = 0, so accuracy % alone is not the right measure of model effectiveness. The key criterion for such biased data is the "Neg Pred Value", which indicates the % of the "Negative Class" predicted correctly; the share of 1s predicted correctly is 97.13% on the training data. Measurement on the test dataset: by the same criterion, the share of 1s predicted correctly is 70.91%.

bankcampaigndata_analysis_training$predictVITargetProb <- predict(mybankVIFinalRF,bankcampaigndata_analysis_training, type='response')
bankcampaigndata_analysis_training$predictVITarget <- ifelse(bankcampaigndata_analysis_training$predictVITargetProb <0.5,0,1)

mycamaigntVIable <- table(actualclass=bankcampaigndata_analysis_training$TARGET, predictedclass=bankcampaigndata_analysis_training$predictVITarget)
mycampaignconfusionVImatrix <- confusionMatrix(mycamaigntVIable)
print(mycampaignconfusionVImatrix)
## Confusion Matrix and Statistics
## 
##            predictedclass
## actualclass     0     1
##           0 13117     0
##           1    54  1829
##                                           
##                Accuracy : 0.9964          
##                  95% CI : (0.9953, 0.9973)
##     No Information Rate : 0.8781          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9834          
##  Mcnemar's Test P-Value : 5.498e-13       
##                                           
##             Sensitivity : 0.9959          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9713          
##              Prevalence : 0.8781          
##          Detection Rate : 0.8745          
##    Detection Prevalence : 0.8745          
##       Balanced Accuracy : 0.9980          
##                                           
##        'Positive' Class : 0               
## 
bankcampaigndata_analysis_test$predictVITargetProb <- predict(mybankVIFinalRF,bankcampaigndata_analysis_test, type='response')
bankcampaigndata_analysis_test$predictVITarget <- ifelse(bankcampaigndata_analysis_test$predictVITargetProb <0.5,0,1)

mycamaigntableVITest <- table(actualclass=bankcampaigndata_analysis_test$TARGET, predictedclass=bankcampaigndata_analysis_test$predictVITarget)
mycampaignconfusionmatrixVITest <- confusionMatrix(mycamaigntableVITest)   ### Neg Pred Value is noticeably lower on the test data
print(mycampaignconfusionmatrixVITest)
## Confusion Matrix and Statistics
## 
##            predictedclass
## actualclass    0    1
##           0 4356   15
##           1  183  446
##                                           
##                Accuracy : 0.9604          
##                  95% CI : (0.9546, 0.9656)
##     No Information Rate : 0.9078          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7967          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9597          
##             Specificity : 0.9675          
##          Pos Pred Value : 0.9966          
##          Neg Pred Value : 0.7091          
##              Prevalence : 0.9078          
##          Detection Rate : 0.8712          
##    Detection Prevalence : 0.8742          
##       Balanced Accuracy : 0.9636          
##                                           
##        'Positive' Class : 0               
## 
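As a cross-check (a sketch using the cell counts printed above), the reported Neg Pred Value equals the share of actual responders the model flags correctly, because the tables were built with actual classes as rows:

1829 / (54 + 1829)   # = 0.9713 on the training data
446  / (183 + 446)   # = 0.7091 on the test data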

Gain & Lift Chart if another aspect of Model Performance measurement. Lift and Gain Charts are a useful way of visualizing how good a predictive model is Measurement on Test Dataset: Cumulative total response rate is 69.1% for the target where response is received

bankcampaigndata_analysis_test$kpi <- predict(mybankVIFinalRF,bankcampaigndata_analysis_test, type='response')

newdata<- bankcampaigndata_analysis_test[
  with(bankcampaigndata_analysis_test, order(-kpi)),
]

mydecile <- decile(newdata$kpi)
mygain <- gains(bankcampaigndata_analysis_test$TARGET, bankcampaigndata_analysis_test$predictVITarget,groups=10)
## Warning in gains(bankcampaigndata_analysis_test$TARGET,
## bankcampaigndata_analysis_test$predictVITarget, : Warning: Fewer distinct
## predicted values than groups requested
print(mygain)
## Depth                            Cume   Cume Pct                     Mean
##  of           Cume     Mean      Mean   of Total    Lift   Cume     Model
## File     N      N      Resp      Resp      Resp    Index   Lift     Score
## -------------------------------------------------------------------------
##    9   461    461      0.97      0.97      70.9%     769    769      1.00
##  100  4539   5000      0.04      0.13     100.0%      32    100      0.00
plot(mygain)

### Conclusion & Recommendation Rank Order table on the test data set shows that we should target all the customers who are in decile 10,9 and 8. This will help us to achieve response rate of 93% response rate. Beyond this, the increase in cumulative response is becoming flat and marketing cost increase will not be able to justify the response % increase.

bankcampaigndata_analysis_test$deciles <- decile(bankcampaigndata_analysis_test$predictVITargetProb)

library(data.table)
tmp_DT = data.table(bankcampaigndata_analysis_test)
h_rank <- tmp_DT[, list(
  cnt = length(TARGET), 
  cnt_resp = sum(TARGET), 
  cnt_non_resp = sum(TARGET == 0)) , 
  by=deciles][order(-deciles)]
h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp / sum(h_rank$cnt_non_resp),2);
h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp);

View(h_rank)
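The ks column measures the separation between the cumulative capture of responders and non-responders; its maximum is the model's Kolmogorov-Smirnov (KS) statistic. A one-line sketch (the percentage form is a common convention):

max(h_rank$ks) * 100  # KS statistic in percent; higher means better separation across deciles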