The Bank executed a campaign to cross-sell personal loans. As part of the pilot campaign, 20,000 customers were contacted through email, SMS, and direct mail. They were offered a personal loan at an attractive interest rate of 12%, with the processing fee waived if they responded within one month. 2,512 customers expressed interest and are marked as Target = 1. A model has to be built using a supervised learning technique to find profitable segments to target for cross-selling personal loans.
A brief analysis of the dataset revealed the following:
–Customer ID is used mainly for the bank's internal processing and is not relevant for model building or recommendations
–The Random Number column is likewise not required for the analysis
–Age Bracket and Age reflect the same information, so only Age is kept for further analysis
–The Gender column has three unique values; it is unclear whether "Other" is a genuine category or stands in for unknown data. For the time being, "Other" has been treated as another gender, which needs to be validated with the business team
–R's read.csv could not parse the ACC_OP_DATE field, so an explicit date conversion with the as.Date function was required
–The data is imbalanced: 87% of observations have a target value of 0 while only 13% have a target value of 1
–The following features have outliers: NO_OF_L_CR_TXNS, NO_OF_L_DR_TXNS, TOT_NO_OF_L_TXNS, NO_OF_BR_CSH_WDL_DR_TXNS, NO_OF_ATM_DR_TXNS, NO_OF_NET_DR_TXNS, NO_OF_MOB_DR_TXNS, NO_OF_CHQ_DR_TXNS, AMT_ATM_DR.V1, AMT_CHQ_DR.V1, AMT_NET_DR.V1, AMT_MOB_DR.V1, AMT_L_DR.V1, AMT_OTH_BK_ATM_USG_CHGS, AMT_MIN_BAL_NMC_CHGS, NO_OF_IW_CHQ_BNC_TXNS, NO_OF_OW_CHQ_BNC_TXNS, AVG_AMT_PER_ATM_TXN.V1, AVG_AMT_PER_CSH_WDL_TXN.V1, AVG_AMT_PER_CHQ_TXN.V1, AVG_AMT_PER_NET_TXN.V1, AVG_AMT_PER_MOB_TXN.V1
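One common way to reproduce such an outlier screen is the 1.5 × IQR (boxplot) rule. A minimal sketch, assuming the data frame bankcampaigndata has already been read in (the load step appears below); flag_outliers is a hypothetical helper name:
flag_outliers <- function(x) {
  qs  <- quantile(x, c(0.25, 0.75), na.rm = TRUE)  # lower and upper quartiles
  iqr <- qs[2] - qs[1]
  sum(x < qs[1] - 1.5 * iqr | x > qs[2] + 1.5 * iqr, na.rm = TRUE)  # count of outlying points
}
sapply(bankcampaigndata[, c("NO_OF_L_CR_TXNS", "NO_OF_L_DR_TXNS", "TOT_NO_OF_L_TXNS")], flag_outliers)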
# Load packages for analysis and this section will have all the required libraries mentioned for better clarity
library('ggplot2') # visualization
## Warning: package 'ggplot2' was built under R version 3.4.1
library('ggthemes') # visualization
## Warning: package 'ggthemes' was built under R version 3.4.1
library('scales') # visualization
## Warning: package 'scales' was built under R version 3.4.1
library('dplyr') # data manipulation
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('mice') # imputation
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
library('randomForest') # classification algorithm
## Warning: package 'randomForest' was built under R version 3.4.1
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library('rpart') # for decision tree
## Warning: package 'rpart' was built under R version 3.4.2
library('ROCR')
## Warning: package 'ROCR' was built under R version 3.4.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('corrr')
## Warning: package 'corrr' was built under R version 3.4.1
library('corrplot')
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library('glue')
## Warning: package 'glue' was built under R version 3.4.2
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
library('caTools')
## Warning: package 'caTools' was built under R version 3.4.1
library('data.table')
## Warning: package 'data.table' was built under R version 3.4.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require("GGally")
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 3.4.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
require("geosphere")
## Loading required package: geosphere
## Warning: package 'geosphere' was built under R version 3.4.2
require("gmapsdistance")
## Loading required package: gmapsdistance
## Warning: package 'gmapsdistance' was built under R version 3.4.2
require("tidyr")
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mice':
##
## complete
#source("distance.R")
library('car')
## Warning: package 'car' was built under R version 3.4.2
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library('caret')
## Warning: package 'caret' was built under R version 3.4.3
library('gclus')
## Warning: package 'gclus' was built under R version 3.4.1
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 3.4.2
library('visdat')
## Warning: package 'visdat' was built under R version 3.4.1
library('psych')
## Warning: package 'psych' was built under R version 3.4.2
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library('leaflet')
## Warning: package 'leaflet' was built under R version 3.4.1
library('leaflet.extras')
## Warning: package 'leaflet.extras' was built under R version 3.4.1
library("PerformanceAnalytics")
## Warning: package 'PerformanceAnalytics' was built under R version 3.4.2
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.4.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following object is masked from 'package:leaflet':
##
## addLegend
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
library('GPArotation')
## Warning: package 'GPArotation' was built under R version 3.4.1
library('MVN')
## Warning: package 'MVN' was built under R version 3.4.2
## sROC 0.1-2 loaded
##
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
##
## mardia
library('MASS')
## Warning: package 'MASS' was built under R version 3.4.1
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library('psy')
## Warning: package 'psy' was built under R version 3.4.1
##
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
##
## wkappa
library('corpcor')
## Warning: package 'corpcor' was built under R version 3.4.1
library('fastmatch')
## Warning: package 'fastmatch' was built under R version 3.4.1
##
## Attaching package: 'fastmatch'
## The following object is masked from 'package:dplyr':
##
## coalesce
library('plyr')
## Warning: package 'plyr' was built under R version 3.4.1
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library('ggcorrplot')
## Warning: package 'ggcorrplot' was built under R version 3.4.2
library('cluster')
library('rpart.plot')
## Warning: package 'rpart.plot' was built under R version 3.4.2
library('rattle')
## Warning: package 'rattle' was built under R version 3.4.2
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
library('RColorBrewer')
## Warning: package 'RColorBrewer' was built under R version 3.4.1
library('maptree')
## Warning: package 'maptree' was built under R version 3.4.2
library('tree')
## Warning: package 'tree' was built under R version 3.4.3
library('dummies') # for converting categorical into dummy one
## Warning: package 'dummies' was built under R version 3.4.1
## dummies-1.5.6 provided by Decision Patterns
library('pscl') ## for McFadden R2
## Warning: package 'pscl' was built under R version 3.4.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library('StatMeasures')
## Warning: package 'StatMeasures' was built under R version 3.4.3
library('sqldf')
## Warning: package 'sqldf' was built under R version 3.4.3
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 3.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 3.4.1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 3.4.1
library('purrr')
## Warning: package 'purrr' was built under R version 3.4.3
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:caret':
##
## lift
## The following object is masked from 'package:car':
##
## some
## The following object is masked from 'package:data.table':
##
## transpose
## The following object is masked from 'package:scales':
##
## discard
library('gains')
## Warning: package 'gains' was built under R version 3.4.1
bankcampaigndata <- read.csv('bankdatafile.csv')
mydatac <- bankcampaigndata
print("No of Observation in the File:\n")
## [1] "No of Observation in the File:\n"
nrow(bankcampaigndata)
## [1] 20000
bankcampaigndata$ACC_OP_DATE <- as.Date(bankcampaigndata$ACC_OP_DATE,"%m/%d/%y")
str(bankcampaigndata)
## 'data.frame': 20000 obs. of 40 variables:
## $ CUST_ID : Factor w/ 20000 levels "C1","C10","C100",..: 17699 16532 11027 17984 2363 11747 18115 15556 15216 12494 ...
## $ TARGET : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AGE : int 27 47 40 53 36 42 30 53 42 30 ...
## $ GENDER : Factor w/ 3 levels "F","M","O": 2 2 2 2 2 1 2 1 1 2 ...
## $ BALANCE : num 3384 287489 18217 71720 1671623 ...
## $ OCCUPATION : Factor w/ 4 levels "PROF","SAL","SELF-EMP",..: 3 2 3 2 1 1 1 2 3 1 ...
## $ AGE_BKT : Factor w/ 7 levels "<25",">50","26-30",..: 3 7 5 2 5 6 3 2 6 3 ...
## $ SCR : int 776 324 603 196 167 493 479 562 105 170 ...
## $ HOLDING_PERIOD : int 30 28 2 13 24 26 14 25 15 13 ...
## $ ACC_TYPE : Factor w/ 2 levels "CA","SA": 2 2 2 1 2 2 2 1 2 2 ...
## $ ACC_OP_DATE : Date, format: "2020-03-23" "2020-10-11" ...
## $ LEN_OF_RLTN_IN_MNTH : int 146 104 61 107 185 192 177 99 88 111 ...
## $ NO_OF_L_CR_TXNS : int 7 8 10 36 20 5 6 14 18 14 ...
## $ NO_OF_L_DR_TXNS : int 3 2 5 14 1 2 6 3 14 8 ...
## $ TOT_NO_OF_L_TXNS : int 10 10 15 50 21 7 12 17 32 22 ...
## $ NO_OF_BR_CSH_WDL_DR_TXNS: int 0 0 1 4 1 1 0 3 6 3 ...
## $ NO_OF_ATM_DR_TXNS : int 1 1 1 2 0 1 1 0 2 1 ...
## $ NO_OF_NET_DR_TXNS : int 2 1 1 3 0 0 1 0 4 0 ...
## $ NO_OF_MOB_DR_TXNS : int 0 0 0 1 0 0 0 0 1 0 ...
## $ NO_OF_CHQ_DR_TXNS : int 0 0 2 4 0 0 4 0 1 4 ...
## $ FLG_HAS_CC : int 0 0 0 0 0 1 0 0 1 0 ...
## $ AMT_ATM_DR : int 13100 6600 11200 26100 0 18500 6200 0 35400 18000 ...
## $ AMT_BR_CSH_WDL_DR : int 0 0 561120 673590 808480 379310 0 945160 198430 869880 ...
## $ AMT_CHQ_DR : int 0 0 49320 60780 0 0 10580 0 51490 32610 ...
## $ AMT_NET_DR : num 973557 799813 997570 741506 0 ...
## $ AMT_MOB_DR : int 0 0 0 71388 0 0 0 0 170332 0 ...
## $ AMT_L_DR : num 986657 806413 1619210 1573364 808480 ...
## $ FLG_HAS_ANY_CHGS : int 0 1 1 0 0 0 1 0 0 0 ...
## $ AMT_OTH_BK_ATM_USG_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AMT_MIN_BAL_NMC_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_IW_CHQ_BNC_TXNS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_OW_CHQ_BNC_TXNS : int 0 0 1 0 0 0 0 0 0 0 ...
## $ AVG_AMT_PER_ATM_TXN : num 13100 6600 11200 13050 0 ...
## $ AVG_AMT_PER_CSH_WDL_TXN : num 0 0 561120 168398 808480 ...
## $ AVG_AMT_PER_CHQ_TXN : num 0 0 24660 15195 0 ...
## $ AVG_AMT_PER_NET_TXN : num 486779 799813 997570 247169 0 ...
## $ AVG_AMT_PER_MOB_TXN : num 0 0 0 71388 0 ...
## $ FLG_HAS_NOMINEE : int 1 1 1 1 1 1 0 1 1 0 ...
## $ FLG_HAS_OLD_LOAN : int 1 0 1 0 0 1 1 1 1 0 ...
## $ random : num 1.14e-05 1.11e-04 1.20e-04 1.37e-04 1.74e-04 ...
bankcampaigndata <- within(bankcampaigndata, rm('AGE_BKT')) ## removing the Age Bracket column as it is redundant here
print("Categorisation of Target Data:\n")
## [1] "Categorisation of Target Data:\n"
table(bankcampaigndata$TARGET) ## Data seems to be imbalanced here
##
## 0 1
## 17488 2512
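For reference, the imbalance can be expressed as proportions directly from this table:
prop.table(table(bankcampaigndata$TARGET)) ## 0.8744 vs 0.1256, i.e. the 87%/13% split noted above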
bankcampaigndata_analysis <- bankcampaigndata ## taking the data into another frame for further manipulation
summary(bankcampaigndata)
## CUST_ID TARGET AGE GENDER
## C1 : 1 Min. :0.0000 Min. :21.00 F: 5433
## C10 : 1 1st Qu.:0.0000 1st Qu.:30.00 M:14376
## C100 : 1 Median :0.0000 Median :38.00 O: 191
## C1000 : 1 Mean :0.1256 Mean :38.42
## C10000 : 1 3rd Qu.:0.0000 3rd Qu.:46.00
## C10001 : 1 Max. :1.0000 Max. :55.00
## (Other):19994
## BALANCE OCCUPATION SCR HOLDING_PERIOD
## Min. : 0 PROF :5417 Min. :100.0 Min. : 1.00
## 1st Qu.: 64754 SAL :5855 1st Qu.:227.0 1st Qu.: 7.00
## Median : 231676 SELF-EMP:3568 Median :364.0 Median :15.00
## Mean : 511362 SENP :5160 Mean :440.2 Mean :14.96
## 3rd Qu.: 653877 3rd Qu.:644.0 3rd Qu.:22.00
## Max. :8360431 Max. :999.0 Max. :31.00
##
## ACC_TYPE ACC_OP_DATE LEN_OF_RLTN_IN_MNTH NO_OF_L_CR_TXNS
## CA: 4241 Min. :2019-01-02 Min. : 29.0 Min. : 0.00
## SA:15759 1st Qu.:2020-03-16 1st Qu.: 79.0 1st Qu.: 6.00
## Median :2020-06-20 Median :125.0 Median :10.00
## Mean :2020-06-09 Mean :125.2 Mean :12.35
## 3rd Qu.:2020-09-25 3rd Qu.:172.0 3rd Qu.:14.00
## Max. :2020-12-31 Max. :221.0 Max. :75.00
##
## NO_OF_L_DR_TXNS TOT_NO_OF_L_TXNS NO_OF_BR_CSH_WDL_DR_TXNS
## Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 9.00 1st Qu.: 1.000
## Median : 5.000 Median : 14.00 Median : 1.000
## Mean : 6.634 Mean : 18.98 Mean : 1.883
## 3rd Qu.: 7.000 3rd Qu.: 21.00 3rd Qu.: 2.000
## Max. :74.000 Max. :149.00 Max. :15.000
##
## NO_OF_ATM_DR_TXNS NO_OF_NET_DR_TXNS NO_OF_MOB_DR_TXNS NO_OF_CHQ_DR_TXNS
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 1.000 Median : 0.000 Median : 0.0000 Median : 2.000
## Mean : 1.029 Mean : 1.172 Mean : 0.4118 Mean : 2.138
## 3rd Qu.: 1.000 3rd Qu.: 1.000 3rd Qu.: 0.0000 3rd Qu.: 4.000
## Max. :25.000 Max. :22.000 Max. :25.0000 Max. :15.000
##
## FLG_HAS_CC AMT_ATM_DR AMT_BR_CSH_WDL_DR AMT_CHQ_DR
## Min. :0.0000 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.: 2990 1st Qu.: 0
## Median :0.0000 Median : 6900 Median :340150 Median : 23840
## Mean :0.3054 Mean : 10990 Mean :378475 Mean : 124520
## 3rd Qu.:1.0000 3rd Qu.: 15800 3rd Qu.:674675 3rd Qu.: 72470
## Max. :1.0000 Max. :199300 Max. :999930 Max. :4928640
##
## AMT_NET_DR AMT_MOB_DR AMT_L_DR FLG_HAS_ANY_CHGS
## Min. : 0 Min. : 0 Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 237936 1st Qu.:0.0000
## Median : 0 Median : 0 Median : 695115 Median :0.0000
## Mean :237308 Mean : 22425 Mean : 773717 Mean :0.1106
## 3rd Qu.:473971 3rd Qu.: 0 3rd Qu.:1078927 3rd Qu.:0.0000
## Max. :999854 Max. :199667 Max. :6514921 Max. :1.0000
##
## AMT_OTH_BK_ATM_USG_CHGS AMT_MIN_BAL_NMC_CHGS NO_OF_IW_CHQ_BNC_TXNS
## Min. : 0.000 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:0.00000
## Median : 0.000 Median : 0.000 Median :0.00000
## Mean : 1.099 Mean : 1.292 Mean :0.04275
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :250.000 Max. :170.000 Max. :2.00000
##
## NO_OF_OW_CHQ_BNC_TXNS AVG_AMT_PER_ATM_TXN AVG_AMT_PER_CSH_WDL_TXN
## Min. :0.0000 Min. : 0 Min. : 0
## 1st Qu.:0.0000 1st Qu.: 0 1st Qu.: 1266
## Median :0.0000 Median : 6000 Median :147095
## Mean :0.0444 Mean : 7409 Mean :242237
## 3rd Qu.:0.0000 3rd Qu.:13500 3rd Qu.:385000
## Max. :2.0000 Max. :25000 Max. :999640
##
## AVG_AMT_PER_CHQ_TXN AVG_AMT_PER_NET_TXN AVG_AMT_PER_MOB_TXN
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0 1st Qu.: 0
## Median : 8645 Median : 0 Median : 0
## Mean : 25093 Mean :179059 Mean : 20304
## 3rd Qu.: 28605 3rd Qu.:257699 3rd Qu.: 0
## Max. :537842 Max. :999854 Max. :199667
##
## FLG_HAS_NOMINEE FLG_HAS_OLD_LOAN random
## Min. :0.0000 Min. :0.0000 Min. :0.0000114
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.2481866
## Median :1.0000 Median :0.0000 Median :0.5061214
## Mean :0.9012 Mean :0.4929 Mean :0.5019330
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.7535712
## Max. :1.0000 Max. :1.0000 Max. :0.9999471
##
### selecting data and running a query on the data frame
banking_duplicate_Indicator <- sqldf('SELECT count(cust_id),CUST_ID FROM bankcampaigndata GROUP BY CUST_ID ORDER BY CUST_ID')
write.csv(banking_duplicate_Indicator, file = "banking_duplicate_Indicator.csv")
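To list only genuinely duplicated IDs instead of exporting the full count table, a HAVING clause can be added; a sketch using the same sqldf approach (an empty result means every CUST_ID is unique):
## rows returned here would be duplicated customer IDs
sqldf('SELECT CUST_ID, COUNT(*) AS n FROM bankcampaigndata GROUP BY CUST_ID HAVING COUNT(*) > 1')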
The SQL summaries show the following details about the data:
–The 26-30 age bracket has the highest number of credit transactions, with the 31-35 bracket very close behind. The 26-30 bracket also withdraws the largest amounts of cash from ATMs, while the other brackets show average cash withdrawals.
–Credit card transactions appear to be the most popular transaction mode for every age bracket. Debit card transactions are the next most common, and "Mobile Banking Debit Transactions" are the least preferred mode across all brackets.
–Age brackets 26-30, 31-35, and 41-45 have the highest numbers of transactions.
–The <25 bracket has almost the same average cash withdrawal and average amount debited per cheque transaction.
–"Avg. Amt withdrawn per Cash Withdrawal Transaction" is the highest average, with "Avg. Amt debited per Net Transaction" second.
–"Avg. Amt debited per Mobile Banking Transaction" is low for all age brackets, mainly because of the low number of mobile banking debit transactions.
–The number of transactions and the average cash withdrawal across the different modes do not depend on whether the customer has an old loan.
–Unexpectedly, customers without a credit card made a much higher number of credit transactions; it is not clear how customers who hold no credit card made 173,423 credit transactions. This may be a data issue and needs to be validated with the business team.
–Average withdrawals through the different modes (cheque, cash, and net) are, surprisingly, much higher for customers who do not hold a credit card.
–Cash transactions appear to be the preferred mode across all age brackets; net transactions are second, and mobile transactions are picking up, becoming about as popular as cheque transactions.
–The >50 bracket appears to use mobile banking the most of all the age brackets.
qqmysummary <- sqldf('select AGE_BKT, sum(NO_OF_L_CR_TXNS) "No. of Credit Transactions",sum(NO_OF_L_DR_TXNS) "No. of Debit Transactions", sum(NO_OF_ATM_DR_TXNS) "No. of ATM Debit Transactions",sum(NO_OF_NET_DR_TXNS) "No. of Net Debit Transactions",sum(NO_OF_MOB_DR_TXNS) "No. of Mobile Banking Debit Transactions",sum(NO_OF_CHQ_DR_TXNS) "No. of Cheque Debit Transactions", avg(AVG_AMT_PER_ATM_TXN) "Avg. Amt withdrawn per ATM Transaction", avg(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",avg(AVG_AMT_PER_CHQ_TXN) "Avg. Amt debited per Cheque Transaction", avg(AVG_AMT_PER_NET_TXN) "Avg. Amt debited per Net Transaction", avg(AVG_AMT_PER_MOB_TXN) "Avg. Amt debited per Mobile Banking Transaction" FROM mydatac group by AGE_BKT ORDER by AGE_BKT')
write.csv(qqmysummary, file = "qqsummary.csv")
# sqldf('SELECT count(cust_id),CUST_ID FROM bankcampaigndata GROUP BY CUST_ID ORDER BY CUST_ID')
# sqldf('SELECT distinct OCCUPATION FROM bankcampaigndata ')
mysummary <- sqldf('select AGE_BKT, sum(NO_OF_L_CR_TXNS) "No. of Credit Transactions",sum(NO_OF_L_DR_TXNS) "No. of Debit Transactions", sum(NO_OF_ATM_DR_TXNS) "No. of ATM Debit Transactions",sum(NO_OF_NET_DR_TXNS) "No. of Net Debit Transactions",sum(NO_OF_MOB_DR_TXNS) "No. of Mobile Banking Debit Transactions",sum(NO_OF_CHQ_DR_TXNS) "No. of Cheque Debit Transactions", sum(AVG_AMT_PER_ATM_TXN) "Avg. Amt withdrawn per ATM Transaction", sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",sum(AVG_AMT_PER_CHQ_TXN) "Avg. Amt debited per Cheque Transaction", sum(AVG_AMT_PER_NET_TXN) "Avg. Amt debited per Net Transaction", sum(AVG_AMT_PER_MOB_TXN) "Avg. Amt debited per Mobile Banking Transaction" FROM mydatac group by AGE_BKT ORDER by AGE_BKT')
write.csv(mysummary, file = "summary.csv")
mynewsummary <- sqldf('select FLG_HAS_OLD_LOAN, sum(NO_OF_L_CR_TXNS) "No. of Credit Transactions",sum(NO_OF_L_DR_TXNS) "No. of Debit Transactions", sum(NO_OF_ATM_DR_TXNS) "No. of ATM Debit Transactions",sum(NO_OF_NET_DR_TXNS) "No. of Net Debit Transactions",sum(NO_OF_MOB_DR_TXNS) "No. of Mobile Banking Debit Transactions",sum(NO_OF_CHQ_DR_TXNS) "No. of Cheque Debit Transactions", sum(AVG_AMT_PER_ATM_TXN) "Avg. Amt withdrawn per ATM Transaction", sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",sum(AVG_AMT_PER_CHQ_TXN) "Avg. Amt debited per Cheque Transaction", sum(AVG_AMT_PER_NET_TXN) "Avg. Amt debited per Net Transaction", sum(AVG_AMT_PER_MOB_TXN) "Avg. Amt debited per Mobile Banking Transaction" FROM mydatac group by FLG_HAS_OLD_LOAN')
write.csv(mynewsummary, file = "newsummary.csv")
mynewccsummary <- sqldf('select FLG_HAS_CC, sum(NO_OF_L_CR_TXNS) "No. of Credit Transactions",sum(NO_OF_L_DR_TXNS) "No. of Debit Transactions", sum(NO_OF_ATM_DR_TXNS) "No. of ATM Debit Transactions",sum(NO_OF_NET_DR_TXNS) "No. of Net Debit Transactions",sum(NO_OF_MOB_DR_TXNS) "No. of Mobile Banking Debit Transactions",sum(NO_OF_CHQ_DR_TXNS) "No. of Cheque Debit Transactions", sum(AVG_AMT_PER_ATM_TXN) "Avg. Amt withdrawn per ATM Transaction", sum(AVG_AMT_PER_CSH_WDL_TXN) "Avg. Amt withdrawn per Cash Withdrawal Transaction",sum(AVG_AMT_PER_CHQ_TXN) "Avg. Amt debited per Cheque Transaction", sum(AVG_AMT_PER_NET_TXN) "Avg. Amt debited per Net Transaction", sum(AVG_AMT_PER_MOB_TXN) "Avg. Amt debited per Mobile Banking Transaction" FROM mydatac group by FLG_HAS_CC')
write.csv(mynewccsummary, file = "flagccnewsummary.csv")
Quick code to draw histograms for all numeric variables in the data. Observations: customers around an average age of 30 registered the maximum number of transactions in the shared dataset; most customers hold a very minimal balance; an SCR around 250 has the maximum frequency; a holding period of 15 has the maximum frequency; 5-6 credit transactions and 5-6 debit transactions are the most frequent counts; and a total of 10-15 transactions has the maximum frequency, followed closely by 15-20.
bankcampaigndata %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## density diagram for all the variables
bankcampaigndata %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density() # as density
## bar chart for all the numeric variables
bankcampaigndata %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_bar()
## correlation overview across the numeric variables
mycor <- ggcorr(bankcampaigndata)
## Warning in ggcorr(bankcampaigndata): data in column(s) 'CUST_ID', 'GENDER',
## 'OCCUPATION', 'ACC_TYPE', 'ACC_OP_DATE' are not numeric and were ignored
mycor
### Data Preparation & Hypothesis Statement Validation
Scaling the variables here. Data values are on very different scales, so the following features have been chosen for scaling: BALANCE, AMT_ATM_DR, AMT_BR_CSH_WDL_DR, AMT_CHQ_DR, AMT_NET_DR, AMT_MOB_DR, AMT_L_DR, AVG_AMT_PER_ATM_TXN, AVG_AMT_PER_CSH_WDL_TXN, AVG_AMT_PER_CHQ_TXN, AVG_AMT_PER_NET_TXN, AVG_AMT_PER_MOB_TXN.
bankcampaigndata$BALANCE <- scale(bankcampaigndata$BALANCE)
bankcampaigndata$AMT_ATM_DR <-scale(bankcampaigndata$AMT_ATM_DR)
bankcampaigndata$AVG_AMT_PER_MOB_TXN <-scale(bankcampaigndata$AVG_AMT_PER_MOB_TXN)
bankcampaigndata$AVG_AMT_PER_NET_TXN <-scale(bankcampaigndata$AVG_AMT_PER_NET_TXN)
bankcampaigndata$AVG_AMT_PER_CHQ_TXN <-scale(bankcampaigndata$AVG_AMT_PER_CHQ_TXN)
bankcampaigndata$AVG_AMT_PER_CSH_WDL_TXN <-scale(bankcampaigndata$AVG_AMT_PER_CSH_WDL_TXN)
bankcampaigndata$AVG_AMT_PER_ATM_TXN <-scale(bankcampaigndata$AVG_AMT_PER_ATM_TXN)
bankcampaigndata$AMT_L_DR <-scale(bankcampaigndata$AMT_L_DR)
bankcampaigndata$AMT_MOB_DR <-scale(bankcampaigndata$AMT_MOB_DR)
bankcampaigndata$AMT_NET_DR <-scale(bankcampaigndata$AMT_NET_DR)
bankcampaigndata$AMT_CHQ_DR <-scale(bankcampaigndata$AMT_CHQ_DR)
bankcampaigndata$AMT_BR_CSH_WDL_DR <-scale(bankcampaigndata$AMT_BR_CSH_WDL_DR)
bankcampaigndata$AMT_ATM_DR <- scale(bankcampaigndata$AMT_ATM_DR) ## note: AMT_ATM_DR was already scaled above, so this repeat is a no-op (hence its scaled:center of ~0 in the str output below)
##mydatac$AMT_ATM_DR
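The column-by-column calls above can be collapsed into a single pass over a vector of column names; an equivalent sketch (amt_cols is a hypothetical helper name):
amt_cols <- c("BALANCE", "AMT_ATM_DR", "AMT_BR_CSH_WDL_DR", "AMT_CHQ_DR", "AMT_NET_DR",
              "AMT_MOB_DR", "AMT_L_DR", "AVG_AMT_PER_ATM_TXN", "AVG_AMT_PER_CSH_WDL_TXN",
              "AVG_AMT_PER_CHQ_TXN", "AVG_AMT_PER_NET_TXN", "AVG_AMT_PER_MOB_TXN")
bankcampaigndata[amt_cols] <- lapply(bankcampaigndata[amt_cols], scale) ## z-score each column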
Convert the factor variables into dummy variables. The original GENDER, OCCUPATION, and ACC_TYPE columns are replaced by their dummy variables, and CUST_ID is dropped (via the [,-1] below) since it carries no modelling information.
bankcampaigndata_analysis <- dummy.data.frame(bankcampaigndata[,-1], sep = ".")
str(bankcampaigndata_analysis)
## 'data.frame': 20000 obs. of 44 variables:
## $ TARGET : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AGE : int 27 47 40 53 36 42 30 53 42 30 ...
## $ GENDER.F : int 0 0 0 0 0 1 0 1 1 0 ...
## $ GENDER.M : int 1 1 1 1 1 0 1 0 0 1 ...
## $ GENDER.O : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BALANCE : num [1:20000, 1] -0.702 -0.309 -0.682 -0.608 1.603 ...
## ..- attr(*, "scaled:center")= num 511362
## ..- attr(*, "scaled:scale")= num 723593
## $ OCCUPATION.PROF : int 0 0 0 0 1 1 1 0 0 1 ...
## $ OCCUPATION.SAL : int 0 1 0 1 0 0 0 1 0 0 ...
## $ OCCUPATION.SELF-EMP : int 1 0 1 0 0 0 0 0 1 0 ...
## $ OCCUPATION.SENP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SCR : int 776 324 603 196 167 493 479 562 105 170 ...
## $ HOLDING_PERIOD : int 30 28 2 13 24 26 14 25 15 13 ...
## $ ACC_TYPE.CA : int 0 0 0 1 0 0 0 1 0 0 ...
## $ ACC_TYPE.SA : int 1 1 1 0 1 1 1 0 1 1 ...
## $ ACC_OP_DATE : Date, format: "2020-03-23" "2020-10-11" ...
## $ LEN_OF_RLTN_IN_MNTH : int 146 104 61 107 185 192 177 99 88 111 ...
## $ NO_OF_L_CR_TXNS : int 7 8 10 36 20 5 6 14 18 14 ...
## $ NO_OF_L_DR_TXNS : int 3 2 5 14 1 2 6 3 14 8 ...
## $ TOT_NO_OF_L_TXNS : int 10 10 15 50 21 7 12 17 32 22 ...
## $ NO_OF_BR_CSH_WDL_DR_TXNS: int 0 0 1 4 1 1 0 3 6 3 ...
## $ NO_OF_ATM_DR_TXNS : int 1 1 1 2 0 1 1 0 2 1 ...
## $ NO_OF_NET_DR_TXNS : int 2 1 1 3 0 0 1 0 4 0 ...
## $ NO_OF_MOB_DR_TXNS : int 0 0 0 1 0 0 0 0 1 0 ...
## $ NO_OF_CHQ_DR_TXNS : int 0 0 2 4 0 0 4 0 1 4 ...
## $ FLG_HAS_CC : int 0 0 0 0 0 1 0 0 1 0 ...
## $ AMT_ATM_DR : num [1:20000, 1] 0.1379 -0.2868 0.0137 0.9873 -0.7181 ...
## ..- attr(*, "scaled:center")= num 3.78e-17
## ..- attr(*, "scaled:scale")= num 1
## $ AMT_BR_CSH_WDL_DR : num [1:20000, 1] -1.142 -1.142 0.551 0.89 1.297 ...
## ..- attr(*, "scaled:center")= num 378474
## ..- attr(*, "scaled:scale")= num 331418
## $ AMT_CHQ_DR : num [1:20000, 1] -0.283 -0.283 -0.171 -0.145 -0.283 ...
## ..- attr(*, "scaled:center")= num 124520
## ..- attr(*, "scaled:scale")= num 439380
## $ AMT_NET_DR : num [1:20000, 1] 2.308 1.763 2.383 1.58 -0.744 ...
## ..- attr(*, "scaled:center")= num 237308
## ..- attr(*, "scaled:scale")= num 319068
## $ AMT_MOB_DR : num [1:20000, 1] -0.453 -0.453 -0.453 0.989 -0.453 ...
## ..- attr(*, "scaled:center")= num 22425
## ..- attr(*, "scaled:scale")= num 49485
## $ AMT_L_DR : num [1:20000, 1] 0.2988 0.0459 1.1864 1.1221 0.0488 ...
## ..- attr(*, "scaled:center")= num 773717
## ..- attr(*, "scaled:scale")= num 712659
## $ FLG_HAS_ANY_CHGS : int 0 1 1 0 0 0 1 0 0 0 ...
## $ AMT_OTH_BK_ATM_USG_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AMT_MIN_BAL_NMC_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_IW_CHQ_BNC_TXNS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_OW_CHQ_BNC_TXNS : int 0 0 1 0 0 0 0 0 0 0 ...
## $ AVG_AMT_PER_ATM_TXN : num [1:20000, 1] 0.789 -0.112 0.526 0.782 -1.028 ...
## ..- attr(*, "scaled:center")= num 7409
## ..- attr(*, "scaled:scale")= num 7210
## $ AVG_AMT_PER_CSH_WDL_TXN : num [1:20000, 1] -0.898 -0.898 1.182 -0.274 2.098 ...
## ..- attr(*, "scaled:center")= num 242236
## ..- attr(*, "scaled:scale")= num 269838
## $ AVG_AMT_PER_CHQ_TXN : num [1:20000, 1] -0.50808 -0.50808 -0.00876 -0.20041 -0.50808 ...
## ..- attr(*, "scaled:center")= num 25092
## ..- attr(*, "scaled:scale")= num 49387
## $ AVG_AMT_PER_NET_TXN : num [1:20000, 1] 1.092 2.204 2.906 0.242 -0.636 ...
## ..- attr(*, "scaled:center")= num 179059
## ..- attr(*, "scaled:scale")= num 281707
## $ AVG_AMT_PER_MOB_TXN : num [1:20000, 1] -0.437 -0.437 -0.437 1.099 -0.437 ...
## ..- attr(*, "scaled:center")= num 20304
## ..- attr(*, "scaled:scale")= num 46473
## $ FLG_HAS_NOMINEE : int 1 1 1 1 1 1 0 1 1 0 ...
## $ FLG_HAS_OLD_LOAN : int 1 0 1 0 0 1 1 1 1 0 ...
## $ random : num 1.14e-05 1.11e-04 1.20e-04 1.37e-04 1.74e-04 ...
## - attr(*, "dummies")=List of 3
## ..$ GENDER : int 3 4 5
## ..$ OCCUPATION: int 7 8 9 10
## ..$ ACC_TYPE : int 13 14
### changing the column names to meaningful ones
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.PROF"] <- "OCCUPROF"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SAL"] <- "OCCUSAL"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SELF-EMP"] <- "OCCUSELFEMP"
names(bankcampaigndata_analysis)[names(bankcampaigndata_analysis) == "OCCUPATION.SENP"] <- "OCCUSENP"
### Split the data into training and test datasets. Here we have assumed a 75%/25% ratio between training and test data. (Note: the sample is unseeded, so the exact rows vary between runs.)
MyIndex <- sample(1:nrow(bankcampaigndata_analysis), size= 0.25*nrow(bankcampaigndata_analysis))
bankcampaigndata_analysis_test <- bankcampaigndata_analysis[MyIndex,]
nrow(bankcampaigndata_analysis_test)
## [1] 5000
bankcampaigndata_analysis_training <- bankcampaigndata_analysis[-MyIndex,]
nrow(bankcampaigndata_analysis_training)
## [1] 15000
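Because the sample() call above is unseeded, the split changes from run to run, and with a 12.6% response rate the class mix in each partition can drift. A reproducible, stratified alternative using the already-loaded caTools (a sketch, not what was run here):
set.seed(123)                                            ## fix the RNG so the split is repeatable
split_flag <- sample.split(bankcampaigndata_analysis$TARGET, SplitRatio = 0.75)
train_strat <- bankcampaigndata_analysis[split_flag, ]   ## ~75%, TARGET ratio preserved
test_strat  <- bankcampaigndata_analysis[!split_flag, ]  ## ~25%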
str(bankcampaigndata_analysis_training)
## 'data.frame': 15000 obs. of 44 variables:
## $ TARGET : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AGE : int 27 40 53 36 42 30 53 42 30 43 ...
## $ GENDER.F : int 0 0 0 0 1 0 1 1 0 0 ...
## $ GENDER.M : int 1 1 1 1 0 1 0 0 1 1 ...
## $ GENDER.O : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BALANCE : num [1:15000, 1] -0.702 -0.6815 -0.6076 1.6035 0.0143 ...
## $ OCCUPROF : int 0 0 0 1 1 1 0 0 1 1 ...
## $ OCCUSAL : int 0 0 1 0 0 0 1 0 0 0 ...
## $ OCCUSELFEMP : int 1 1 0 0 0 0 0 1 0 0 ...
## $ OCCUSENP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SCR : int 776 603 196 167 493 479 562 105 170 164 ...
## $ HOLDING_PERIOD : int 30 2 13 24 26 14 25 15 13 24 ...
## $ ACC_TYPE.CA : int 0 0 1 0 0 0 1 0 0 1 ...
## $ ACC_TYPE.SA : int 1 1 0 1 1 1 0 1 1 0 ...
## $ ACC_OP_DATE : Date, format: "2020-03-23" "2020-04-26" ...
## $ LEN_OF_RLTN_IN_MNTH : int 146 61 107 185 192 177 99 88 111 183 ...
## $ NO_OF_L_CR_TXNS : int 7 10 36 20 5 6 14 18 14 4 ...
## $ NO_OF_L_DR_TXNS : int 3 5 14 1 2 6 3 14 8 4 ...
## $ TOT_NO_OF_L_TXNS : int 10 15 50 21 7 12 17 32 22 8 ...
## $ NO_OF_BR_CSH_WDL_DR_TXNS: int 0 1 4 1 1 0 3 6 3 2 ...
## $ NO_OF_ATM_DR_TXNS : int 1 1 2 0 1 1 0 2 1 1 ...
## $ NO_OF_NET_DR_TXNS : int 2 1 3 0 0 1 0 4 0 0 ...
## $ NO_OF_MOB_DR_TXNS : int 0 0 1 0 0 0 0 1 0 1 ...
## $ NO_OF_CHQ_DR_TXNS : int 0 2 4 0 0 4 0 1 4 0 ...
## $ FLG_HAS_CC : int 0 0 0 0 1 0 0 1 0 1 ...
## $ AMT_ATM_DR : num [1:15000, 1] 0.1379 0.0137 0.9873 -0.7181 0.4907 ...
## $ AMT_BR_CSH_WDL_DR : num [1:15000, 1] -1.14199 0.5511 0.89046 1.29747 0.00252 ...
## $ AMT_CHQ_DR : num [1:15000, 1] -0.283 -0.171 -0.145 -0.283 -0.283 ...
## $ AMT_NET_DR : num [1:15000, 1] 2.308 2.383 1.58 -0.744 -0.744 ...
## $ AMT_MOB_DR : num [1:15000, 1] -0.453 -0.453 0.989 -0.453 -0.453 ...
## $ AMT_L_DR : num [1:15000, 1] 0.2988 1.1864 1.1221 0.0488 -0.5275 ...
## $ FLG_HAS_ANY_CHGS : int 0 1 0 0 0 1 0 0 0 0 ...
## $ AMT_OTH_BK_ATM_USG_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AMT_MIN_BAL_NMC_CHGS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_IW_CHQ_BNC_TXNS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NO_OF_OW_CHQ_BNC_TXNS : int 0 1 0 0 0 0 0 0 0 0 ...
## $ AVG_AMT_PER_ATM_TXN : num [1:15000, 1] 0.789 0.526 0.782 -1.028 1.538 ...
## $ AVG_AMT_PER_CSH_WDL_TXN : num [1:15000, 1] -0.898 1.182 -0.274 2.098 0.508 ...
## $ AVG_AMT_PER_CHQ_TXN : num [1:15000, 1] -0.50808 -0.00876 -0.20041 -0.50808 -0.50808 ...
## $ AVG_AMT_PER_NET_TXN : num [1:15000, 1] 1.092 2.906 0.242 -0.636 -0.636 ...
## $ AVG_AMT_PER_MOB_TXN : num [1:15000, 1] -0.437 -0.437 1.099 -0.437 -0.437 ...
## $ FLG_HAS_NOMINEE : int 1 1 1 1 1 0 1 1 0 1 ...
## $ FLG_HAS_OLD_LOAN : int 1 1 0 0 1 1 1 1 0 0 ...
## $ random : num 1.14e-05 1.20e-04 1.37e-04 1.74e-04 4.06e-04 ...
## - attr(*, "dummies")=List of 3
## ..$ GENDER : int 3 4 5
## ..$ OCCUPATION: int 7 8 9 10
## ..$ ACC_TYPE : int 13 14
Performing ANOVA for our hypotheses. A statistical hypothesis is an assumption about population parameters; the assumption may or may not be true. Hypothesis testing refers to the formal procedures used by statisticians to accept or reject statistical hypotheses.
Null hypothesis: there is no relationship between a given input variable and the target response. Alternative hypothesis: there is a relationship between the input variable and the target response.
Our acceptance criterion is based on the p-value at the 95% confidence level, so input variables with a p-value below 0.05 are accepted as having a statistically significant impact on the response variable. The objective of the ANOVA test is to analyze whether there is a statistically significant difference in Target across the different input variables.
ANOVA has been executed on the training sample with all input variables. The following input variables have p-values above 0.05 and hence do not have a significant impact on the response variable Target:
–AMT_CHQ_DR
–AMT_OTH_BK_ATM_USG_CHGS
–NO_OF_IW_CHQ_BNC_TXNS
–NO_OF_OW_CHQ_BNC_TXNS
–AVG_AMT_PER_CHQ_TXN
bankcampaigndata.aov <- aov(TARGET ~., bankcampaigndata_analysis_training)
summary(bankcampaigndata.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## AGE 1 1.3 1.29 13.020 0.000309 ***
## GENDER.F 1 3.5 3.50 35.284 2.91e-09 ***
## GENDER.M 1 0.4 0.36 3.583 0.058384 .
## BALANCE 1 6.4 6.36 64.016 1.32e-15 ***
## OCCUPROF 1 0.2 0.17 1.678 0.195164
## OCCUSAL 1 9.3 9.27 93.318 < 2e-16 ***
## OCCUSELFEMP 1 13.0 13.01 131.008 < 2e-16 ***
## SCR 1 8.5 8.54 86.046 < 2e-16 ***
## HOLDING_PERIOD 1 30.0 30.04 302.490 < 2e-16 ***
## ACC_TYPE.CA 1 6.8 6.83 68.794 < 2e-16 ***
## ACC_OP_DATE 1 0.1 0.15 1.463 0.226456
## LEN_OF_RLTN_IN_MNTH 1 0.3 0.29 2.888 0.089259 .
## NO_OF_L_CR_TXNS 1 11.8 11.76 118.479 < 2e-16 ***
## NO_OF_L_DR_TXNS 1 0.2 0.23 2.286 0.130528
## TOT_NO_OF_L_TXNS 1 47.0 46.97 473.048 < 2e-16 ***
## NO_OF_BR_CSH_WDL_DR_TXNS 1 1.9 1.89 19.004 1.31e-05 ***
## NO_OF_ATM_DR_TXNS 1 0.2 0.18 1.800 0.179784
## NO_OF_NET_DR_TXNS 1 0.2 0.20 2.043 0.152917
## NO_OF_MOB_DR_TXNS 1 0.7 0.72 7.208 0.007264 **
## FLG_HAS_CC 1 14.1 14.08 141.781 < 2e-16 ***
## AMT_ATM_DR 1 0.4 0.43 4.370 0.036602 *
## AMT_BR_CSH_WDL_DR 1 0.7 0.68 6.833 0.008960 **
## AMT_CHQ_DR 1 0.1 0.07 0.751 0.386280
## AMT_NET_DR 1 0.4 0.39 3.947 0.046971 *
## AMT_MOB_DR 1 0.1 0.09 0.892 0.344953
## FLG_HAS_ANY_CHGS 1 1.1 1.13 11.346 0.000758 ***
## AMT_OTH_BK_ATM_USG_CHGS 1 0.0 0.01 0.099 0.753266
## AMT_MIN_BAL_NMC_CHGS 1 0.3 0.27 2.673 0.102061
## NO_OF_IW_CHQ_BNC_TXNS 1 0.0 0.00 0.003 0.954476
## NO_OF_OW_CHQ_BNC_TXNS 1 0.0 0.00 0.000 0.990425
## AVG_AMT_PER_ATM_TXN 1 0.7 0.72 7.239 0.007140 **
## AVG_AMT_PER_CSH_WDL_TXN 1 0.1 0.11 1.136 0.286592
## AVG_AMT_PER_CHQ_TXN 1 0.2 0.24 2.382 0.122788
## AVG_AMT_PER_NET_TXN 1 0.3 0.28 2.802 0.094147 .
## AVG_AMT_PER_MOB_TXN 1 0.1 0.09 0.905 0.341586
## FLG_HAS_NOMINEE 1 0.4 0.41 4.156 0.041502 *
## FLG_HAS_OLD_LOAN 1 0.3 0.32 3.226 0.072510 .
## random 1 0.0 0.02 0.237 0.626485
## Residuals 14961 1485.5 0.10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
A first dry run of the random forest is executed with the following parameters, and the resulting forest's error is plotted. ***The optimal number of trees from the following plot is selected as 40, where the error has stabilized.
ntree: number of trees to grow; taken as 80 for the dry run. mtry: number of variables sampled at each split; by default, mtry = floor(sqrt(number of independent variables)) for a classification model, and here we have taken 3. nodesize: minimum size of terminal nodes; we have taken 7.
mybankRF <- randomForest(TARGET ~., data= bankcampaigndata_analysis_training, ntree=80, mtry=3, nodesize=7,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(mybankRF)
##
## Call:
## randomForest(formula = TARGET ~ ., data = bankcampaigndata_analysis_training, ntree = 80, mtry = 3, nodesize = 7, importance = TRUE)
## Type of random forest: regression
## Number of trees: 80
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.04645676
## % Var explained: 57.68
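The regression warning above appears because TARGET was left as an integer, so randomForest fits a regression forest whose predictions act as response scores between 0 and 1. A classification forest could be requested instead by recoding the response as a factor; a sketch of that variant (not what the analysis below uses):
mybankRFcls <- randomForest(as.factor(TARGET) ~ ., data = bankcampaigndata_analysis_training,
                            ntree = 80, mtry = 3, nodesize = 7, importance = TRUE) ## no regression warning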
plot(mybankRF, main="Random Forest Tree Optimal Diagram",col = "dark red")
legend("topright", c("OOB","0","1"), text.col = 1:6,lty = 1:3,col = 1:3) ## So our optimium no of tree 20
Important variable detection is done here using the extractor function for the variable importance measures produced by randomForest, then sorting by node purity. ***Note: all variables down to an IncNodePurity value of about 20 are treated as key variables.
Key variables for the model: BALANCE, SCR (generic marketing score), AMT_L_DR (total amount debited), ACC_OP_DATE (account opening date), HOLDING_PERIOD (ability to hold money in the account), LEN_OF_RLTN_IN_MNTH (length of relationship in months), TOT_NO_OF_L_TXNS (total no. of transactions), NO_OF_L_CR_TXNS (no. of credit transactions), AGE, AMT_BR_CSH_WDL_DR (amount of cash withdrawn at the branch), AVG_AMT_PER_CSH_WDL_TXN (avg. amount withdrawn per cash withdrawal transaction), AMT_ATM_DR (amount withdrawn from ATMs), AVG_AMT_PER_ATM_TXN (avg. amount withdrawn per ATM transaction), AVG_AMT_PER_CHQ_TXN (avg. amount debited per cheque transaction), AMT_CHQ_DR (amount debited by cheque transactions), AMT_NET_DR (amount debited by net transactions), AVG_AMT_PER_NET_TXN (avg. amount debited per net transaction), NO_OF_L_DR_TXNS (no. of debit transactions), NO_OF_BR_CSH_WDL_DR_TXNS (no. of branch cash withdrawal transactions), NO_OF_CHQ_DR_TXNS (no. of cheque debit transactions), AMT_MOB_DR (amount debited by mobile banking transactions), AVG_AMT_PER_MOB_TXN (avg. amount debited per mobile banking transaction), OCCUSELFEMP (self-employed occupation flag).
This measure is the total decrease in node impurities from splitting on the variable, averaged over all trees. For classification, the node impurity is measured by the Gini index. For regression, it is measured by residual sum of squares.
impvar <- round(randomForest::importance(mybankRF),2)
impvar[order(impvar[,2], decreasing = TRUE),] ### BALANCE and SCR are the most important variables
## %IncMSE IncNodePurity
## SCR 15.75 76.82
## BALANCE 20.66 75.85
## ACC_OP_DATE 18.55 69.24
## AMT_L_DR 15.57 65.67
## LEN_OF_RLTN_IN_MNTH 20.08 65.44
## HOLDING_PERIOD 15.49 64.16
## AGE 20.82 58.07
## TOT_NO_OF_L_TXNS 17.24 57.92
## NO_OF_L_CR_TXNS 15.80 56.74
## AMT_BR_CSH_WDL_DR 15.23 56.03
## AVG_AMT_PER_CSH_WDL_TXN 16.36 54.44
## AVG_AMT_PER_ATM_TXN 13.23 54.33
## AMT_ATM_DR 9.81 52.67
## AMT_CHQ_DR 13.53 50.13
## AVG_AMT_PER_CHQ_TXN 18.68 47.78
## AMT_NET_DR 8.34 40.76
## random -0.63 39.56
## AVG_AMT_PER_NET_TXN 6.96 39.37
## NO_OF_L_DR_TXNS 9.77 35.98
## NO_OF_BR_CSH_WDL_DR_TXNS 11.93 28.79
## NO_OF_CHQ_DR_TXNS 10.15 25.93
## AMT_MOB_DR 8.42 23.07
## AVG_AMT_PER_MOB_TXN 9.26 22.21
## OCCUSELFEMP 13.72 21.48
## FLG_HAS_CC 14.91 18.90
## NO_OF_ATM_DR_TXNS 7.54 17.30
## NO_OF_NET_DR_TXNS 8.51 15.30
## OCCUSAL 10.77 12.97
## FLG_HAS_OLD_LOAN 12.79 11.67
## OCCUSENP 9.60 9.50
## GENDER.F 6.28 9.22
## GENDER.M 6.49 9.12
## FLG_HAS_ANY_CHGS 11.35 8.79
## OCCUPROF 8.75 8.72
## ACC_TYPE.SA 8.70 7.56
## ACC_TYPE.CA 7.19 7.24
## FLG_HAS_NOMINEE 12.04 6.50
## NO_OF_MOB_DR_TXNS 5.67 6.33
## NO_OF_OW_CHQ_BNC_TXNS 8.39 5.08
## NO_OF_IW_CHQ_BNC_TXNS 8.07 4.87
## GENDER.O 5.40 2.52
## AMT_MIN_BAL_NMC_CHGS 4.65 1.19
## AMT_OTH_BK_ATM_USG_CHGS 3.34 0.93
varImpPlot(mybankRF) ## plotting the variable importance
Finding the optimal mtry with the tuneRF function. ***The optimal mtry from the following plot is 13, where the OOB error is minimized before it starts to increase again.
mybankTuneRF <- tuneRF(x=bankcampaigndata_analysis_training[,-1], ### so the optimized mtry is 13
y=bankcampaigndata_analysis_training$TARGET,
mtryStart= 3,
ntreeTry=20,
stepFactor=1.5,
improve=0.0001,
trace=TRUE,
plot = TRUE,
doBest = TRUE,
nodesize=10,
importance=TRUE
)
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 3 OOB error = 0.05601554
## Searching left ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 2 OOB error = 0.06335749
## -0.1310699 1e-04
## Searching right ...
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 4 OOB error = 0.05381888
## 0.03921525 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 6 OOB error = 0.05187122
## 0.03618905 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 9 OOB error = 0.05032635
## 0.02978296 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 13 OOB error = 0.04851986
## 0.03589536 1e-04
## Warning in randomForest.default(x, y, mtry = mtryCur, ntree = ntreeTry, :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
## mtry = 19 OOB error = 0.04926101
## -0.0152751 1e-04
## Warning in randomForest.default(x, y, mtry = res[which.min(res[, 2]), 1], :
## The response has five or fewer unique values. Are you sure you want to do
## regression?
Now that we have the important variables, the optimal number of trees, and the optimal mtry, a new random forest object is generated with these revised values.
mybankVIFinalRF <- randomForest(TARGET ~ BALANCE+ SCR+AMT_L_DR+ACC_OP_DATE+HOLDING_PERIOD+LEN_OF_RLTN_IN_MNTH+TOT_NO_OF_L_TXNS+NO_OF_L_CR_TXNS+AGE+AMT_BR_CSH_WDL_DR+AVG_AMT_PER_CSH_WDL_TXN+AMT_ATM_DR+AVG_AMT_PER_ATM_TXN+AVG_AMT_PER_CHQ_TXN+AMT_CHQ_DR+AMT_NET_DR+AVG_AMT_PER_NET_TXN+NO_OF_L_DR_TXNS+NO_OF_BR_CSH_WDL_DR_TXNS+NO_OF_CHQ_DR_TXNS+AMT_MOB_DR+AVG_AMT_PER_MOB_TXN+OCCUSELFEMP , data= bankcampaigndata_analysis_training, ntree=40, mtry=13, nodesize=7,importance=TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
print(mybankVIFinalRF)
##
## Call:
## randomForest(formula = TARGET ~ BALANCE + SCR + AMT_L_DR + ACC_OP_DATE + HOLDING_PERIOD + LEN_OF_RLTN_IN_MNTH + TOT_NO_OF_L_TXNS + NO_OF_L_CR_TXNS + AGE + AMT_BR_CSH_WDL_DR + AVG_AMT_PER_CSH_WDL_TXN + AMT_ATM_DR + AVG_AMT_PER_ATM_TXN + AVG_AMT_PER_CHQ_TXN + AMT_CHQ_DR + AMT_NET_DR + AVG_AMT_PER_NET_TXN + NO_OF_L_DR_TXNS + NO_OF_BR_CSH_WDL_DR_TXNS + NO_OF_CHQ_DR_TXNS + AMT_MOB_DR + AVG_AMT_PER_MOB_TXN + OCCUSELFEMP, data = bankcampaigndata_analysis_training, ntree = 40, mtry = 13, nodesize = 7, importance = TRUE)
## Type of random forest: regression
## Number of trees: 40
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 0.04090712
## % Var explained: 62.74
plot(mybankVIFinalRF, main="Variable of Importance Random Forest Final Optimal Diagram")
print("Tree Sizes are")
## [1] "Tree Sizes are"
treesize(mybankVIFinalRF)
## [1] 2652 2496 2479 2485 2552 2423 2593 2550 2491 2376 2524 2596 2405 2333
## [15] 2576 2379 2576 2361 2534 2324 2555 2624 2391 2657 2423 2591 2282 2524
## [29] 2471 2346 2432 2465 2529 2612 2410 2594 2613 2511 2421 2626
The confusion matrix is one aspect of model performance measurement. Measurement on the training dataset: the input data is biased towards Target = 0, so the accuracy percentage alone is not the right indicator of model effectiveness. The key criterion for such imbalanced data is the "Neg Pred Value", which indicates the percentage of responders (Target = 1) predicted correctly; on the training data it is 97.13%. Measurement on the test dataset: the same "Neg Pred Value" criterion gives 70.91%.
bankcampaigndata_analysis_training$predictVITargetProb <- predict(mybankVIFinalRF,bankcampaigndata_analysis_training, type='response')
bankcampaigndata_analysis_training$predictVITarget <- ifelse(bankcampaigndata_analysis_training$predictVITargetProb <0.5,0,1)
mycamaigntVIable <- table(actualclass=bankcampaigndata_analysis_training$TARGET, predictedclass=bankcampaigndata_analysis_training$predictVITarget)
mycampaignconfusionVImatrix <- confusionMatrix(mycamaigntVIable)
print(mycampaignconfusionVImatrix)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass 0 1
## 0 13117 0
## 1 54 1829
##
## Accuracy : 0.9964
## 95% CI : (0.9953, 0.9973)
## No Information Rate : 0.8781
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9834
## Mcnemar's Test P-Value : 5.498e-13
##
## Sensitivity : 0.9959
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9713
## Prevalence : 0.8781
## Detection Rate : 0.8745
## Detection Prevalence : 0.8745
## Balanced Accuracy : 0.9980
##
## 'Positive' Class : 0
##
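As a cross-check, the reported Neg Pred Value can be recomputed from the table cells: 1829 of the 54 + 1829 = 1883 actual responders sit in the predicted-1 column.
mycamaigntVIable[2, 2] / sum(mycamaigntVIable[2, ]) ## 1829 / 1883 = 0.9713, matching the output above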
bankcampaigndata_analysis_test$predictVITargetProb <- predict(mybankVIFinalRF,bankcampaigndata_analysis_test, type='response')
bankcampaigndata_analysis_test$predictVITarget <- ifelse(bankcampaigndata_analysis_test$predictVITargetProb <0.5,0,1)
mycamaigntableVITest <- table(actualclass=bankcampaigndata_analysis_test$TARGET, predictedclass=bankcampaigndata_analysis_test$predictVITarget)
mycampaignconfusionmatrixVITest <- confusionMatrix(mycamaigntableVITest) ### Neg Pred Value is quite low here
print(mycampaignconfusionmatrixVITest)
## Confusion Matrix and Statistics
##
## predictedclass
## actualclass 0 1
## 0 4356 15
## 1 183 446
##
## Accuracy : 0.9604
## 95% CI : (0.9546, 0.9656)
## No Information Rate : 0.9078
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7967
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9597
## Specificity : 0.9675
## Pos Pred Value : 0.9966
## Neg Pred Value : 0.7091
## Prevalence : 0.9078
## Detection Rate : 0.8712
## Detection Prevalence : 0.8742
## Balanced Accuracy : 0.9636
##
## 'Positive' Class : 0
##
Gain & Lift Chart if another aspect of Model Performance measurement. Lift and Gain Charts are a useful way of visualizing how good a predictive model is Measurement on Test Dataset: Cumulative total response rate is 69.1% for the target where response is received
bankcampaigndata_analysis_test$kpi <- predict(mybankVIFinalRF,bankcampaigndata_analysis_test, type='response')
newdata<- bankcampaigndata_analysis_test[
with(bankcampaigndata_analysis_test, order(-kpi)),
]
mydecile <- decile(newdata$kpi)
mygain <- gains(bankcampaigndata_analysis_test$TARGET, bankcampaigndata_analysis_test$predictVITarget,groups=10)
## Warning in gains(bankcampaigndata_analysis_test$TARGET,
## bankcampaigndata_analysis_test$predictVITarget, : Warning: Fewer distinct
## predicted values than groups requested
print(mygain)
## Depth Cume Cume Pct Mean
## of Cume Mean Mean of Total Lift Cume Model
## File N N Resp Resp Resp Index Lift Score
## -------------------------------------------------------------------------
## 9 461 461 0.97 0.97 70.9% 769 769 1.00
## 100 4539 5000 0.04 0.13 100.0% 32 100 0.00
plot(mygain)
### Conclusion & Recommendation
The rank-order table on the test dataset shows that we should target all customers in deciles 10, 9, and 8. This helps us achieve a response rate of 93%. Beyond this point the increase in cumulative response flattens out, and the added marketing cost would not be justified by the incremental gain in response.
bankcampaigndata_analysis_test$deciles <- decile(bankcampaigndata_analysis_test$predictVITargetProb)
library(data.table)
tmp_DT = data.table(bankcampaigndata_analysis_test)
h_rank <- tmp_DT[, list(
cnt = length(TARGET),
cnt_resp = sum(TARGET),
cnt_non_resp = sum(TARGET == 0)) ,
by=deciles][order(-deciles)]
h_rank$rrate <- round (h_rank$cnt_resp / h_rank$cnt,2);
h_rank$cum_resp <- cumsum(h_rank$cnt_resp)
h_rank$cum_non_resp <- cumsum(h_rank$cnt_non_resp)
h_rank$cum_rel_resp <- round(h_rank$cum_resp / sum(h_rank$cnt_resp),2);
h_rank$cum_rel_non_resp <- round(h_rank$cum_non_resp / sum(h_rank$cnt_non_resp),2);
h_rank$ks <- abs(h_rank$cum_rel_resp - h_rank$cum_rel_non_resp);
View(h_rank)
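The ks column computed above is conventionally summarized by its maximum across deciles, which marks where the separation between responders and non-responders peaks; a short follow-up sketch:
h_rank[which.max(ks), list(deciles, ks)] ## decile with the maximum KS separation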