Packages used: ggcorrplot (correlation matrix), tidyverse (data cleaning), caret (data cleaning), knitr (create table comparison), caTools (fast calculation of AUC, LogitBoost classifier), DMwR (data mining), xgboost (extreme gradient boosting algorithm), ROCR (AUC curve).
library(ggcorrplot)
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(knitr)
library(caTools)
library(DMwR)
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(e1071)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(tinytex)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ROCR)
# Set the working directory to the project folder when it exists on this
# machine. On any other machine the current directory is kept, so the script
# still runs as long as the CSV file sits in the working directory instead of
# aborting inside setwd().
data_dir <- "C:/Users/lenovo/Documents/R/Practice/RHB"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# Import the data
BankData <- read.csv(file = "Data Analysis Skill.csv")
# Preview the first rows
head(BankData)
## CIF_KEY No Age Gender Marital.Status Premier.Customer..1..Yes.
## 1 2092996 1 71 UNKNOWN Single 1
## 2 11934718 2 68 UNKNOWN Married 0
## 3 2327694 3 65 UNKNOWN Married 1
## 4 10556315 4 60 UNKNOWN Married 0
## 5 6136151 5 71 UNKNOWN Single 0
## 6 2239881 6 46 UNKNOWN Married 0
## State Race Occupation Salary All.Deposit.Acc.Balance
## 1 01. KLANG VALLEY CHINESE White Collar 4000.00 406,936.3
## 2 01. KLANG VALLEY CHINESE Others 0.00 346,415.2
## 3 01. KLANG VALLEY CHINESE White Collar 6200.00 325,803.6
## 4 01. KLANG VALLEY CHINESE Unknown 0.00 159,000.0
## 5 01. KLANG VALLEY INDIAN White Collar 0.00 137,410.2
## 6 08. PULAU PINANG CHINESE Self Employed 17171.12 119,802.4
## Count.of.Deposit.Acc Count.of.Product.with.Bank Credit.Card.Balance
## 1 10 2 NA
## 2 9 2 NA
## 3 2 4 10043.92
## 4 8 2 NA
## 5 4 3 NA
## 6 6 4 1969.01
## Count.of.Credit.Card Fixed.Deposit.Balance Housing.Loan.Balance
## 1 NA 383800.00 NA
## 2 NA 340000.00 NA
## 3 2 NA NA
## 4 NA 159000.00 NA
## 5 NA 107000.00 NA
## 6 1 86398.08 NA
## Savings.Acc.Balance Internet.Bank.User..1.Yes. Latest.Product.Brough.Date
## 1 23136.25 0 14/06/2016
## 2 6415.18 0 05/05/2016
## 3 NA 0 18/04/2016
## 4 NA 0 09/01/2017
## 5 26788.51 0 04/02/2016
## 6 33404.33 1 12/10/2016
## Relation.with.Bank..in.month. New.Acct.open.last.6.mths..Count.
## 1 314 NA
## 2 684 NA
## 3 209 NA
## 4 55 4
## 5 212 NA
## 6 357 1
## Internet.Bank.Tranx.last.6.mth..count. Total.Asset.Under.Management..AUM.
## 1 NA 406,936
## 2 NA 346,415
## 3 NA 325,808
## 4 NA 159,000
## 5 NA 137,410
## 6 171 119,802
## Total.Loan AUM.Movement.Slope.last.6.mths
## 1 NA -1478.0774
## 2 NA -806.9297
## 3 NA -2818.1753
## 4 NA -2142.8571
## 5 NA 272.3869
## 6 NA -5846.2591
# Check the class of the imported object (read.csv() returns a data.frame)
class(BankData)
## [1] "data.frame"
# Inspect the structure: column names, column types and the first few values
str(BankData)
## 'data.frame': 100 obs. of 26 variables:
## $ CIF_KEY : int 2092996 11934718 2327694 10556315 6136151 2239881 1310026 2176385 450425 9369079 ...
## $ No : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 71 68 65 60 71 46 69 72 77 28 ...
## $ Gender : chr "UNKNOWN" "UNKNOWN" "UNKNOWN" "UNKNOWN" ...
## $ Marital.Status : chr "Single" "Married" "Married" "Married" ...
## $ Premier.Customer..1..Yes. : int 1 0 1 0 0 0 0 0 0 0 ...
## $ State : chr "01. KLANG VALLEY" "01. KLANG VALLEY" "01. KLANG VALLEY" "01. KLANG VALLEY" ...
## $ Race : chr "CHINESE" "CHINESE" "CHINESE" "CHINESE" ...
## $ Occupation : chr "White Collar" "Others" "White Collar" "Unknown" ...
## $ Salary : num 4000 0 6200 0 0 ...
## $ All.Deposit.Acc.Balance : chr " 406,936.3 " " 346,415.2 " " 325,803.6 " " 159,000.0 " ...
## $ Count.of.Deposit.Acc : int 10 9 2 8 4 6 5 4 3 4 ...
## $ Count.of.Product.with.Bank : int 2 2 4 2 3 4 2 7 1 3 ...
## $ Credit.Card.Balance : num NA NA 10044 NA NA ...
## $ Count.of.Credit.Card : int NA NA 2 NA NA 1 NA 1 NA NA ...
## $ Fixed.Deposit.Balance : num 383800 340000 NA 159000 107000 ...
## $ Housing.Loan.Balance : num NA NA NA NA NA NA NA NA NA NA ...
## $ Savings.Acc.Balance : num 23136 6415 NA NA 26789 ...
## $ Internet.Bank.User..1.Yes. : int 0 0 0 0 0 1 0 1 0 1 ...
## $ Latest.Product.Brough.Date : chr "14/06/2016" "05/05/2016" "18/04/2016" "09/01/2017" ...
## $ Relation.with.Bank..in.month. : int 314 684 209 55 212 357 218 261 222 84 ...
## $ New.Acct.open.last.6.mths..Count. : int NA NA NA 4 NA 1 NA NA NA NA ...
## $ Internet.Bank.Tranx.last.6.mth..count.: int NA NA NA NA NA 171 NA NA NA 11 ...
## $ Total.Asset.Under.Management..AUM. : chr " 406,936 " " 346,415 " " 325,808 " " 159,000 " ...
## $ Total.Loan : num NA NA NA NA NA NA NA NA NA NA ...
## $ AUM.Movement.Slope.last.6.mths : num -1478 -807 -2818 -2143 272 ...
# Per-column summary statistics, including the NA count of numeric columns
summary(BankData)
## CIF_KEY No Age Gender
## Min. : 450425 Min. : 1.00 Min. : 5.00 Length:100
## 1st Qu.: 3276540 1st Qu.: 25.75 1st Qu.:33.00 Class :character
## Median : 7835438 Median : 50.50 Median :47.00 Mode :character
## Mean : 7376156 Mean : 50.50 Mean :47.38
## 3rd Qu.:11178092 3rd Qu.: 75.25 3rd Qu.:61.00
## Max. :13239950 Max. :100.00 Max. :87.00
##
## Marital.Status Premier.Customer..1..Yes. State
## Length:100 Min. :0.00 Length:100
## Class :character 1st Qu.:0.00 Class :character
## Mode :character Median :0.00 Mode :character
## Mean :0.04
## 3rd Qu.:0.00
## Max. :1.00
##
## Race Occupation Salary All.Deposit.Acc.Balance
## Length:100 Length:100 Min. : 0 Length:100
## Class :character Class :character 1st Qu.: 0 Class :character
## Mode :character Mode :character Median : 0 Mode :character
## Mean : 2147
## 3rd Qu.: 3855
## Max. :17171
## NA's :1
## Count.of.Deposit.Acc Count.of.Product.with.Bank Credit.Card.Balance
## Min. : 1 Min. :1.00 Min. : -0.1
## 1st Qu.: 1 1st Qu.:2.00 1st Qu.: 281.9
## Median : 1 Median :2.00 Median : 1969.0
## Mean : 2 Mean :2.36 Mean : 2929.6
## 3rd Qu.: 2 3rd Qu.:3.00 3rd Qu.: 4079.6
## Max. :10 Max. :7.00 Max. :10043.9
## NA's :91
## Count.of.Credit.Card Fixed.Deposit.Balance Housing.Loan.Balance
## Min. :1.000 Min. : 1046 Min. : 8175
## 1st Qu.:1.000 1st Qu.: 17201 1st Qu.: 116734
## Median :1.000 Median : 38820 Median : 277737
## Mean :1.182 Mean : 71819 Mean : 391881
## 3rd Qu.:1.000 3rd Qu.: 89430 3rd Qu.: 316471
## Max. :2.000 Max. :383800 Max. :1757071
## NA's :89 NA's :70 NA's :91
## Savings.Acc.Balance Internet.Bank.User..1.Yes. Latest.Product.Brough.Date
## Min. : 5.47 Min. :0.00 Length:100
## 1st Qu.: 2188.96 1st Qu.:0.00 Class :character
## Median : 4285.59 Median :0.00 Mode :character
## Mean : 8612.27 Mean :0.49
## 3rd Qu.: 9736.36 3rd Qu.:1.00
## Max. :67951.19 Max. :1.00
## NA's :14
## Relation.with.Bank..in.month. New.Acct.open.last.6.mths..Count.
## Min. : 2.00 Min. :1.000
## 1st Qu.: 44.75 1st Qu.:1.000
## Median :130.00 Median :1.000
## Mean :171.68 Mean :1.667
## 3rd Qu.:261.50 3rd Qu.:2.000
## Max. :684.00 Max. :4.000
## NA's :61
## Internet.Bank.Tranx.last.6.mth..count. Total.Asset.Under.Management..AUM.
## Min. : 0.00 Length:100
## 1st Qu.: 0.00 Class :character
## Median : 2.00 Mode :character
## Mean : 14.96
## 3rd Qu.: 14.00
## Max. :171.00
## NA's :75
## Total.Loan AUM.Movement.Slope.last.6.mths
## Min. : 5849 Min. :-5846.26
## 1st Qu.: 34017 1st Qu.: -758.67
## Median : 140071 Median : -261.78
## Mean : 289771 Mean : 162.99
## 3rd Qu.: 316471 3rd Qu.: 2.86
## Max. :1757071 Max. :44420.44
## NA's :87 NA's :2
# Number of rows and columns
dim(BankData)
## [1] 100 26
# Count the missing values in every column (type-stable: one integer per column)
vapply(BankData, function(column) sum(is.na(column)), integer(1))
## CIF_KEY No
## 0 0
## Age Gender
## 0 0
## Marital.Status Premier.Customer..1..Yes.
## 0 0
## State Race
## 0 0
## Occupation Salary
## 0 1
## All.Deposit.Acc.Balance Count.of.Deposit.Acc
## 0 0
## Count.of.Product.with.Bank Credit.Card.Balance
## 0 91
## Count.of.Credit.Card Fixed.Deposit.Balance
## 89 70
## Housing.Loan.Balance Savings.Acc.Balance
## 91 14
## Internet.Bank.User..1.Yes. Latest.Product.Brough.Date
## 0 0
## Relation.with.Bank..in.month. New.Acct.open.last.6.mths..Count.
## 0 61
## Internet.Bank.Tranx.last.6.mth..count. Total.Asset.Under.Management..AUM.
## 75 0
## Total.Loan AUM.Movement.Slope.last.6.mths
## 87 2
# The deposit balance was read as character (values such as " 406,936.3 "
# contain thousands separators), so this check is FALSE
is.integer(BankData[["All.Deposit.Acc.Balance"]])
## [1] FALSE
# Remove the unwanted columns, then convert the categorical columns and the
# two 0/1 flag columns into factors. Column order is left untouched.
BankData <- BankData %>%
  dplyr::select(
    -all_of(c(
      "CIF_KEY",
      "Gender",
      "Housing.Loan.Balance",
      "Credit.Card.Balance",
      "Total.Loan",
      "Count.of.Credit.Card",
      "New.Acct.open.last.6.mths..Count.",
      "Latest.Product.Brough.Date",
      "Internet.Bank.Tranx.last.6.mth..count.",
      "AUM.Movement.Slope.last.6.mths",
      "Savings.Acc.Balance"
    ))
  ) %>%
  mutate(
    across(
      all_of(c(
        "Marital.Status",
        "State",
        "Race",
        "Occupation",
        "Premier.Customer..1..Yes.",
        "Internet.Bank.User..1.Yes."
      )),
      as.factor
    )
  )
# Print the cleaned data frame
BankData
## No Age Marital.Status Premier.Customer..1..Yes. State Race
## 1 1 71 Single 1 01. KLANG VALLEY CHINESE
## 2 2 68 Married 0 01. KLANG VALLEY CHINESE
## 3 3 65 Married 1 01. KLANG VALLEY CHINESE
## 4 4 60 Married 0 01. KLANG VALLEY CHINESE
## 5 5 71 Single 0 01. KLANG VALLEY INDIAN
## 6 6 46 Married 0 08. PULAU PINANG CHINESE
## 7 7 69 Single 0 04. PAHANG CHINESE
## 8 8 72 Married 0 01. KLANG VALLEY OTHERS
## 9 9 77 Married 0 02. JOHOR CHINESE
## 10 10 28 Single 0 02. JOHOR CHINESE
## 11 11 85 Married 0 05. PERAK CHINESE
## 12 12 70 Married 0 12. SARAWAK CHINESE
## 13 13 59 Married 0 03. MELAKA CHINESE
## 14 14 9 Others 0 05. PERAK CHINESE
## 15 15 45 Single 0 09. KEDAH CHINESE
## 16 16 74 Married 0 08. PULAU PINANG CHINESE
## 17 17 47 Single 0 12. SARAWAK CHINESE
## 18 18 87 Single 0 02. JOHOR CHINESE
## 19 19 64 Single 0 02. JOHOR CHINESE
## 20 20 57 Married 0 05. PERAK CHINESE
## 21 21 58 Married 0 01. KLANG VALLEY INDIAN
## 22 22 33 Single 0 13. SABAH CHINESE
## 23 23 47 Single 0 06. N. SEMBILAN CHINESE
## 24 24 49 Married 0 N/A OTHERS
## 25 25 63 Married 0 12. SARAWAK CHINESE
## 26 26 46 Married 0 02. JOHOR CHINESE
## 27 27 23 Single 0 09. KEDAH INDIAN
## 28 28 45 Married 0 04. PAHANG CHINESE
## 29 29 50 Married 0 01. KLANG VALLEY CHINESE
## 30 30 56 Single 0 02. JOHOR CHINESE
## 31 31 33 Married 0 01. KLANG VALLEY CHINESE
## 32 32 61 Single 0 06. N. SEMBILAN INDIAN
## 33 33 49 Married 1 05. PERAK CHINESE
## 34 34 53 Single 0 03. MELAKA CHINESE
## 35 35 47 Married 0 12. SARAWAK OTHERS
## 36 36 31 Single 0 01. KLANG VALLEY CHINESE
## 37 37 25 Single 0 08. PULAU PINANG BUMIPUTRA
## 38 38 35 Married 0 08. PULAU PINANG CHINESE
## 39 39 43 Single 0 05. PERAK CHINESE
## 40 40 56 Divorced 0 01. KLANG VALLEY BUMIPUTRA
## 41 41 66 Single 0 01. KLANG VALLEY CHINESE
## 42 42 55 Married 0 08. PULAU PINANG CHINESE
## 43 43 79 Married 0 01. KLANG VALLEY CHINESE
## 44 44 48 Single 0 01. KLANG VALLEY CHINESE
## 45 45 49 Married 0 01. KLANG VALLEY BUMIPUTRA
## 46 46 64 Married 0 05. PERAK CHINESE
## 47 47 65 Married 0 01. KLANG VALLEY BUMIPUTRA
## 48 48 60 Single 0 05. PERAK CHINESE
## 49 49 42 Single 0 01. KLANG VALLEY CHINESE
## 50 50 73 Single 0 06. N. SEMBILAN CHINESE
## 51 51 60 Married 0 01. KLANG VALLEY CHINESE
## 52 52 64 Married 0 08. PULAU PINANG CHINESE
## 53 53 70 Married 0 02. JOHOR CHINESE
## 54 54 45 Single 0 01. KLANG VALLEY CHINESE
## 55 55 25 Single 0 02. JOHOR BUMIPUTRA
## 56 56 32 Single 0 02. JOHOR BUMIPUTRA
## 57 57 62 Single 0 12. SARAWAK BUMIPUTRA
## 58 58 56 Married 0 06. N. SEMBILAN CHINESE
## 59 59 47 Single 0 05. PERAK CHINESE
## 60 60 36 Single 0 01. KLANG VALLEY INDIAN
## 61 61 73 Married 0 04. PAHANG CHINESE
## 62 62 60 Married 0 01. KLANG VALLEY BUMIPUTRA
## 63 63 58 Single 0 12. SARAWAK CHINESE
## 64 64 36 Single 0 02. JOHOR CHINESE
## 65 65 50 Single 0 08. PULAU PINANG BUMIPUTRA
## 66 66 22 Single 0 01. KLANG VALLEY CHINESE
## 67 67 30 Single 0 02. JOHOR CHINESE
## 68 68 23 Single 0 08. PULAU PINANG CHINESE
## 69 69 39 Married 0 01. KLANG VALLEY BUMIPUTRA
## 70 70 38 Single 0 13. SABAH INDIAN
## 71 71 70 Married 0 05. PERAK BUMIPUTRA
## 72 72 35 Single 1 06. N. SEMBILAN CHINESE
## 73 73 54 Married 0 01. KLANG VALLEY CHINESE
## 74 74 31 Single 0 02. JOHOR BUMIPUTRA
## 75 75 30 Single 0 N/A OTHERS
## 76 76 45 Widowed 0 01. KLANG VALLEY OTHERS
## 77 77 55 Married 0 01. KLANG VALLEY CHINESE
## 78 78 33 Single 0 04. PAHANG BUMIPUTRA
## 79 79 66 Single 0 12. SARAWAK BUMIPUTRA
## 80 80 24 Single 0 01. KLANG VALLEY BUMIPUTRA
## 81 81 40 Married 0 01. KLANG VALLEY CHINESE
## 82 82 16 Single 0 01. KLANG VALLEY CHINESE
## 83 83 41 Single 0 02. JOHOR CHINESE
## 84 84 48 Married 0 02. JOHOR BUMIPUTRA
## 85 85 36 Married 0 07. TERENGGANU BUMIPUTRA
## 86 86 24 Single 0 13. SABAH CHINESE
## 87 87 31 Single 0 01. KLANG VALLEY OTHERS
## 88 88 36 Married 0 01. KLANG VALLEY CHINESE
## 89 89 5 Single 0 01. KLANG VALLEY CHINESE
## 90 90 10 Single 0 08. PULAU PINANG CHINESE
## 91 91 27 Single 0 13. SABAH BUMIPUTRA
## 92 92 21 Single 0 12. SARAWAK CHINESE
## 93 93 25 Single 0 13. SABAH BUMIPUTRA
## 94 94 23 Single 0 12. SARAWAK BUMIPUTRA
## 95 95 61 Married 0 05. PERAK CHINESE
## 96 96 21 Single 0 07. TERENGGANU BUMIPUTRA
## 97 97 45 Married 0 01. KLANG VALLEY BUMIPUTRA
## 98 98 37 Married 0 07. TERENGGANU BUMIPUTRA
## 99 99 60 Married 0 12. SARAWAK BUMIPUTRA
## 100 100 34 Single 0 05. PERAK CHINESE
## Occupation Salary All.Deposit.Acc.Balance Count.of.Deposit.Acc
## 1 White Collar 4000.000 406,936.3 10
## 2 Others 0.000 346,415.2 9
## 3 White Collar 6200.000 325,803.6 2
## 4 Unknown 0.000 159,000.0 8
## 5 White Collar 0.000 137,410.2 4
## 6 Self Employed 17171.120 119,802.4 6
## 7 Self Employed 0.000 115,002.8 5
## 8 White Collar 8333.000 104,018.8 4
## 9 Unknown 0.000 96,179.3 3
## 10 Blue Collar 0.000 90,474.4 4
## 11 Others 0.000 90,440.6 8
## 12 Retired 0.000 85,059.2 4
## 13 White Collar 0.000 74,137.8 3
## 14 Unknown NA 72,017.6 1
## 15 White Collar 0.000 67,951.2 1
## 16 Others 0.000 58,651.4 1
## 17 White Collar 4543.917 58,216.1 2
## 18 Unknown 0.000 54,477.8 3
## 19 Unknown 0.000 44,833.5 1
## 20 Others 0.000 41,038.8 4
## 21 Unknown 0.000 39,557.1 2
## 22 White Collar 0.000 34,955.2 2
## 23 White Collar 0.000 34,360.0 2
## 24 Self Employed 6812.260 32,011.0 1
## 25 Others 0.000 30,984.7 8
## 26 Self Employed 0.000 30,261.7 2
## 27 Student 0.000 30,005.5 2
## 28 White Collar 7241.090 27,713.6 1
## 29 Unknown 0.000 24,309.8 3
## 30 Others 0.000 24,308.6 2
## 31 Blue Collar 0.000 23,719.6 1
## 32 Self Employed 0.000 21,528.8 2
## 33 Self Employed 14004.000 21,032.9 1
## 34 White Collar 7500.000 20,259.7 1
## 35 Others 0.000 19,481.0 1
## 36 Self Employed 0.000 19,337.6 1
## 37 Blue Collar 0.000 19,249.2 1
## 38 White Collar 0.000 18,190.3 3
## 39 Blue Collar 0.000 17,798.9 3
## 40 Blue Collar 0.000 13,826.2 1
## 41 Unknown 0.000 13,638.4 1
## 42 White Collar 0.000 13,498.7 2
## 43 Unknown 0.000 12,894.1 1
## 44 Unknown 0.000 12,220.4 2
## 45 White Collar 0.000 11,198.3 1
## 46 Unknown 0.000 10,550.0 2
## 47 Unknown 0.000 10,424.5 1
## 48 White Collar 0.000 10,078.5 2
## 49 Blue Collar 0.000 9,971.0 1
## 50 Retired 0.000 9,788.2 3
## 51 White Collar 8333.333 8,377.8 2
## 52 Retired 0.000 8,346.5 2
## 53 Unknown 0.000 7,912.9 1
## 54 White Collar 1500.000 7,768.0 4
## 55 White Collar 8225.360 7,366.3 1
## 56 Blue Collar 5962.420 7,218.9 2
## 57 Retired 0.000 7,083.1 1
## 58 White Collar 4166.667 6,959.7 1
## 59 Unknown 0.000 6,601.3 1
## 60 White Collar 6500.000 6,287.8 1
## 61 Unknown 0.000 6,165.5 1
## 62 Unknown 4583.020 5,973.9 1
## 63 Blue Collar 3700.000 5,291.4 1
## 64 Unknown 0.000 5,049.9 1
## 65 Unknown 0.000 4,857.5 1
## 66 Student 0.000 4,613.2 1
## 67 Blue Collar 10863.660 4,480.3 1
## 68 Blue Collar 2935.800 4,262.6 1
## 69 White Collar 0.000 4,256.6 1
## 70 White Collar 4837.490 4,168.1 3
## 71 Unknown 0.000 4,119.5 1
## 72 Unknown 3949.800 4,036.6 1
## 73 Unknown 0.000 3,886.4 1
## 74 Blue Collar 4114.050 3,831.3 1
## 75 White Collar 5737.620 3,732.8 1
## 76 Others 0.000 3,449.1 1
## 77 White Collar 11125.850 3,124.1 1
## 78 White Collar 3096.900 3,103.7 1
## 79 Others 0.000 3,059.9 1
## 80 Student 0.000 3,016.7 1
## 81 Blue Collar 0.000 2,852.5 1
## 82 Student 0.000 2,653.5 1
## 83 White Collar 6800.580 2,564.1 1
## 84 Self Employed 8742.010 2,534.7 2
## 85 Blue Collar 1884.560 2,452.0 1
## 86 White Collar 4149.930 2,412.1 1
## 87 Blue Collar 0.000 2,286.6 1
## 88 White Collar 3760.260 2,230.3 1
## 89 Student 0.000 2,175.2 1
## 90 Unknown 0.000 2,167.5 1
## 91 White Collar 1681.500 2,162.3 1
## 92 Student 0.000 2,065.7 1
## 93 Student 1641.050 2,059.1 1
## 94 Blue Collar 771.370 2,028.1 1
## 95 Self Employed 10700.000 2,019.9 2
## 96 Student 0.000 2,013.4 1
## 97 White Collar 3300.000 1,997.4 1
## 98 Blue Collar 2327.250 1,980.9 1
## 99 White Collar 1366.667 1,780.4 2
## 100 Blue Collar 0.000 1,750.0 1
## Count.of.Product.with.Bank Fixed.Deposit.Balance Internet.Bank.User..1.Yes.
## 1 2 383800.00 0
## 2 2 340000.00 0
## 3 4 NA 0
## 4 2 159000.00 0
## 5 3 107000.00 0
## 6 4 86398.08 1
## 7 2 113000.00 0
## 8 7 100000.00 1
## 9 1 96179.25 0
## 10 3 85000.00 1
## 11 1 90440.64 0
## 12 3 80548.04 1
## 13 2 71000.00 0
## 14 1 NA 0
## 15 1 NA 0
## 16 1 58651.35 0
## 17 4 58000.00 1
## 18 2 40000.00 0
## 19 1 NA 0
## 20 3 37040.39 0
## 21 2 37640.72 0
## 22 2 NA 1
## 23 2 16267.83 0
## 24 3 NA 1
## 25 2 30913.58 0
## 26 3 30000.00 1
## 27 3 30000.00 1
## 28 2 NA 1
## 29 2 23450.84 0
## 30 3 20000.00 1
## 31 2 NA 1
## 32 3 NA 1
## 33 3 NA 0
## 34 2 NA 0
## 35 1 NA 0
## 36 2 NA 1
## 37 2 NA 1
## 38 3 10025.48 1
## 39 2 12903.78 0
## 40 1 NA 0
## 41 1 NA 0
## 42 2 NA 0
## 43 1 NA 0
## 44 3 12207.14 0
## 45 2 NA 1
## 46 1 NA 0
## 47 1 NA 0
## 48 3 1046.05 1
## 49 1 NA 0
## 50 2 9700.00 0
## 51 4 NA 1
## 52 3 8322.08 0
## 53 1 NA 0
## 54 3 6045.63 1
## 55 2 NA 1
## 56 5 NA 0
## 57 1 NA 0
## 58 2 NA 1
## 59 2 NA 1
## 60 4 NA 1
## 61 1 NA 0
## 62 4 NA 1
## 63 3 NA 0
## 64 1 NA 0
## 65 3 NA 1
## 66 1 NA 0
## 67 1 NA 0
## 68 2 NA 1
## 69 2 NA 1
## 70 7 NA 1
## 71 2 NA 1
## 72 2 NA 1
## 73 2 NA 1
## 74 4 NA 1
## 75 4 NA 1
## 76 2 NA 1
## 77 2 NA 1
## 78 2 NA 1
## 79 1 NA 0
## 80 2 NA 1
## 81 2 NA 1
## 82 1 NA 0
## 83 3 NA 1
## 84 4 NA 1
## 85 2 NA 1
## 86 2 NA 1
## 87 2 NA 1
## 88 2 NA 0
## 89 1 NA 0
## 90 1 NA 0
## 91 4 NA 1
## 92 1 NA 0
## 93 2 NA 1
## 94 2 NA 1
## 95 5 NA 0
## 96 2 NA 1
## 97 4 NA 0
## 98 4 NA 0
## 99 4 NA 1
## 100 2 NA 0
## Relation.with.Bank..in.month. Total.Asset.Under.Management..AUM.
## 1 314 406,936
## 2 684 346,415
## 3 209 325,808
## 4 55 159,000
## 5 212 137,410
## 6 357 119,802
## 7 218 115,003
## 8 261 104,019
## 9 222 96,179
## 10 84 90,474
## 11 256 90,441
## 12 130 85,059
## 13 20 74,138
## 14 103 72,018
## 15 232 67,951
## 16 43 58,651
## 17 268 58,216
## 18 233 54,478
## 19 208 44,833
## 20 361 41,039
## 21 293 39,557
## 22 123 34,955
## 23 309 34,360
## 24 45 32,011
## 25 130 30,985
## 26 40 30,262
## 27 56 30,005
## 28 95 27,714
## 29 169 24,310
## 30 362 24,309
## 31 24 23,720
## 32 381 21,529
## 33 234 21,033
## 34 362 20,260
## 35 161 19,481
## 36 5 19,338
## 37 82 19,249
## 38 10 18,190
## 39 263 17,799
## 40 33 13,826
## 41 445 13,638
## 42 244 13,499
## 43 451 12,894
## 44 287 22,220
## 45 55 11,198
## 46 411 10,550
## 47 684 10,424
## 48 124 10,079
## 49 127 9,971
## 50 395 9,788
## 51 320 8,378
## 52 289 8,347
## 53 502 7,913
## 54 189 7,768
## 55 15 7,366
## 56 108 7,219
## 57 172 7,083
## 58 153 6,960
## 59 375 6,601
## 60 117 6,288
## 61 291 6,165
## 62 378 5,974
## 63 286 5,291
## 64 197 5,050
## 65 233 4,857
## 66 118 4,613
## 67 33 4,480
## 68 4 4,263
## 69 34 4,257
## 70 61 4,168
## 71 231 4,120
## 72 24 4,037
## 73 174 3,886
## 74 49 3,831
## 75 29 3,733
## 76 3 3,449
## 77 29 3,124
## 78 56 3,104
## 79 166 3,060
## 80 10 3,017
## 81 7 2,852
## 82 51 2,654
## 83 34 2,564
## 84 277 2,535
## 85 43 2,452
## 86 40 2,412
## 87 44 2,287
## 88 53 2,230
## 89 34 2,175
## 90 107 2,168
## 91 32 2,162
## 92 130 2,066
## 93 79 2,059
## 94 46 2,028
## 95 220 2,020
## 96 31 2,013
## 97 175 1,997
## 98 2 1,981
## 99 199 1,780
## 100 48 1,750
Exploratory data analysis (EDA) is done to see the patterns in the data and to extract some insights from it.
# Bar chart of the premier-customer flag (legend hidden: the fill only
# repeats the x axis), followed by the count of each class
ggplot(data = BankData) +
  geom_bar(
    mapping = aes(
      x = Premier.Customer..1..Yes.,
      fill = Premier.Customer..1..Yes.
    )
  ) +
  theme(legend.position = "none")
table(BankData[["Premier.Customer..1..Yes."]])
##
## 0 1
## 96 4
The number of non-premier customers is much larger than the number of premier customers: there are 4 premier customers and 96 non-premier customers.
# Bar chart of the internet-banking flag (legend hidden: the fill only
# repeats the x axis), followed by the count of each class
ggplot(data = BankData) +
  geom_bar(
    mapping = aes(
      x = Internet.Bank.User..1.Yes.,
      fill = Internet.Bank.User..1.Yes.
    )
  ) +
  theme(legend.position = "none")
table(BankData[["Internet.Bank.User..1.Yes."]])
##
## 0 1
## 51 49
Almost the same number of customers use internet banking (49) as do not use the online banking service (51).
# Histograms of every numeric column, one facet per variable.
# pivot_longer() replaces the superseded gather(); the long-format columns
# keep the names `key` and `value` that the plotting layers refer to.
# The default of 30 bins is kept on purpose.
BankData %>%
  dplyr::select(-Internet.Bank.User..1.Yes., -Premier.Customer..1..Yes.) %>%
  keep(is.numeric) %>%
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = value, fill = key), color = "black") +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 71 rows containing non-finite values (stat_bin).
From the result we can observe that:
# Bar charts of the level counts of every factor column, one facet per
# variable.
# The factors are converted to character before stacking: combining factors
# with different level sets otherwise relies on an implicit coercion that
# raises an "attributes are not identical" warning. count() tallies each
# (key, value) pair into column `n` without the regrouping message that
# group_by() + summarize() prints.
BankData %>%
  keep(is.factor) %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  dplyr::count(key, value) %>%
  ggplot() +
  geom_col(mapping = aes(x = value, y = n, fill = key), color = "black") +
  coord_flip() +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none")
From the graph shown above, we can observe that:
# Correlation matrix of the numeric columns.
# numericVarName still lists every numeric column. `No` is only the record
# counter (it runs from 1 to 100), not a customer attribute, so it is left
# out of the correlation matrix. Pairwise-complete observations are used
# because several columns contain NA values.
numericVarName <- names(BankData)[vapply(BankData, is.numeric, logical(1))]
corrVarName <- setdiff(numericVarName, "No")
corr <- cor(
  BankData[, corrVarName, drop = FALSE],
  use = "pairwise.complete.obs"
)
ggcorrplot(corr, lab = TRUE)
An ANOVA table is used to determine whether there is any significant difference in the period of customer relationship with the bank between occupations.
#install.packages("ggpubr")
library(ggpubr)
# One-way ANOVA of relationship length (in months) across occupation groups
res.aov <- aov(
  formula = Relation.with.Bank..in.month. ~ Occupation,
  data = BankData
)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Occupation 6 590585 98431 5.921 2.88e-05 ***
## Residuals 93 1546157 16625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Box plot of relationship length (x axis) for each occupation (y axis)
box_plot <- ggplot(
  data = BankData,
  mapping = aes(x = Relation.with.Bank..in.month., y = Occupation)
)
box_plot + geom_boxplot()
Hypothesis Test
H0: The mean period of customer relationship with the bank is the same for every occupation group.
H1: The mean period of customer relationship with the bank differs for at least one occupation group.
Since the p-value (2.88e-05) is less than 0.05, we reject H0.
There is enough evidence to reject H0. Thus, there is a significant difference in the period of customer relationship with the bank between occupations.