library(dplyr)
library(knitr)
library(forcats) #library to replace missing value
The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).
Data source: https://archive.ics.uci.edu/ml/datasets/bank+marketing
mydf = read.table('data/dbanks.csv',sep=',',header = T)
Statistical Summary
summary(mydf)
## age job marital education
## Min. :18.00 blue-collar:9844 divorced: 5315 primary : 6966
## 1st Qu.:33.00 management :9632 married :27579 secondary:23539
## Median :39.00 technician :7720 single :13027 tertiary :13525
## Mean :40.95 admin. :5252 NA's : 646 unknown : 1879
## 3rd Qu.:48.00 services :4210 NA's : 658
## Max. :95.00 (Other) :9260
## NA's :641 NA's : 649
## default balance housing loan contact
## no :45125 Min. : -8019 no :20400 no :38591 cellular :29749
## yes : 825 1st Qu.: 73 yes :25489 yes : 7324 telephone: 2953
## NA's: 617 Median : 448 NA's: 678 NA's: 652 unknown :13215
## Mean : 1363 NA's : 650
## 3rd Qu.: 1426
## Max. :102127
## NA's :658
## day month duration campaign
## Min. : 1.0 may :14017 Min. : 0.0 Min. : 1.000
## 1st Qu.: 8.0 jul : 6989 1st Qu.: 103.0 1st Qu.: 1.000
## Median :16.0 aug : 6359 Median : 180.0 Median : 2.000
## Mean :15.8 jun : 5426 Mean : 258.1 Mean : 2.767
## 3rd Qu.:21.0 nov : 4038 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :31.0 (Other): 9086 Max. :4918.0 Max. :63.000
## NA's :632 NA's : 652 NA's :648 NA's :657
## pdays previous poutcome y
## Min. : -1.00 Min. : 0.0000 failure: 4994 no :40547
## 1st Qu.: -1.00 1st Qu.: 0.0000 other : 1871 yes : 5385
## Median : -1.00 Median : 0.0000 success: 1538 NA's: 635
## Mean : 40.31 Mean : 0.5795 unknown:37551
## 3rd Qu.: -1.00 3rd Qu.: 0.0000 NA's : 613
## Max. :871.00 Max. :275.0000
## NA's :654 NA's :649
sum(duplicated(mydf))
## [1] 1450
How Many Rows Contain Missing Data
sum(!complete.cases(mydf))
## [1] 1259
How Many Rows Are Completely Missing Values In All Columns
all.empty = rowSums(is.na(mydf))==ncol(mydf)
sum(all.empty)
## [1] 61
Missing Value By Variable
sapply(mydf, function(x) sum(is.na(x)))
## age job marital education default balance housing
## 641 649 646 658 617 658 678
## loan contact day month duration campaign pdays
## 652 650 632 652 648 657 654
## previous poutcome y
## 649 613 635
mydf.clean = mydf[!all.empty,]
Saved deduplicated data to new variable ‘mydf.clean’
mydf.clean = mydf.clean %>% distinct
Number of Rows After Dedup
nrow(mydf.clean)
## [1] 45116
Create New Column To Indicate Missing Detection
mydf.clean$missing = !complete.cases(mydf.clean)
Missing Numeric Value Treatment
Replace with Average
mydf.clean$age[is.na(mydf.clean$age)] = mean(mydf$age, na.rm=T)
mydf.clean$day[is.na(mydf.clean$day)] = mean(mydf$day, na.rm=T)
mydf.clean$duration[is.na(mydf.clean$duration)] = mean(mydf$duration, na.rm=T)
mydf.clean$previous[is.na(mydf.clean$previous)] = mean(mydf$previous, na.rm=T)
mydf.clean$campaign[is.na(mydf.clean$campaign)] = mean(mydf$campaign, na.rm=T)
Replace with Mode - Below variables distribution is highly skewed at at specific value, hence we are going to impute missing value with the mode
hist(mydf.clean$balance)
hist(mydf.clean$pdays)
mydf.clean$pdays[is.na(mydf.clean$pdays)] = as.numeric(names(sort(-table(mydf$pdays)))[1])
mydf.clean$balance[is.na(mydf.clean$balance)] = as.numeric(names(sort(-table(mydf$balance)))[1])
Missing Categorical Treatment
Replace with Special Category
mydf.clean$job = fct_explicit_na(mydf.clean$job, "missing")
mydf.clean$marital = fct_explicit_na(mydf.clean$marital, "missing")
mydf.clean$education = fct_explicit_na(mydf.clean$education, "missing")
mydf.clean$default = fct_explicit_na(mydf.clean$default, "missing")
mydf.clean$loan = fct_explicit_na(mydf.clean$loan, "missing")
mydf.clean$contact = fct_explicit_na(mydf.clean$contact, "missing")
mydf.clean$poutcome = fct_explicit_na(mydf.clean$poutcome, "missing")
mydf.clean$y = fct_explicit_na(mydf.clean$y, "missing")
mydf.clean$housing = fct_explicit_na(mydf.clean$housing, "missing")
mydf.clean$month = fct_explicit_na(mydf.clean$month, "missing")
After imputation, certain rows became identical hence need to be deduplicated.
mydf.clean = mydf.clean %>% distinct
Number of Rows Before Dedup
nrow(mydf)
## [1] 46567
Number of Rows After Dedup
Number of rows had been reduced after deduplication
nrow(mydf.clean)
## [1] 45112
Number of Deduplicated Rows
There is no more duplicated rows
sum(duplicated(mydf.clean))
## [1] 0
All missing value has been treated
sapply(mydf.clean, function(x) sum(is.na(x)))
## age job marital education default balance housing
## 0 0 0 0 0 0 0
## loan contact day month duration campaign pdays
## 0 0 0 0 0 0 0
## previous poutcome y missing
## 0 0 0 0
New ‘missing’ levels has been introduced
levels(mydf.clean$job)
## [1] "admin." "blue-collar" "entrepreneur" "housemaid"
## [5] "management" "retired" "self-employed" "services"
## [9] "student" "technician" "unemployed" "unknown"
## [13] "missing"
levels(mydf.clean$marital)
## [1] "divorced" "married" "single" "missing"
levels(mydf.clean$education)
## [1] "primary" "secondary" "tertiary" "unknown" "missing"
levels(mydf.clean$default)
## [1] "no" "yes" "missing"
levels(mydf.clean$loan)
## [1] "no" "yes" "missing"
levels(mydf.clean$contact)
## [1] "cellular" "telephone" "unknown" "missing"
levels(mydf.clean$poutcome)
## [1] "failure" "other" "success" "unknown" "missing"
levels(mydf.clean$y)
## [1] "no" "yes" "missing"
levels(mydf.clean$housing)
## [1] "no" "yes" "missing"
levels(mydf.clean$month)
## [1] "apr" "aug" "dec" "feb" "jan" "jul" "jun"
## [8] "mar" "may" "nov" "oct" "sep" "missing"
How Many Rows Had Missing Data Before Cleaning
sum(mydf.clean$missing)
## [1] 1121
Display Those Rows That Had Missing Data Before
mydf.clean %>% filter(missing==T) %>% head %>% kable
| age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | missing |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 28.00000 | blue-collar | married | missing | missing | 0 | yes | missing | missing | 15.80327 | may | 258.0954 | 1.00000 | -1 | 0.5794895 | missing | no | TRUE |
| 46.00000 | missing | missing | missing | no | 0 | yes | missing | missing | 15.80327 | may | 258.0954 | 2.76663 | -1 | 0.5794895 | unknown | no | TRUE |
| 32.00000 | management | missing | tertiary | no | 0 | missing | no | missing | 5.00000 | may | 179.0000 | 1.00000 | -1 | 0.0000000 | missing | no | TRUE |
| 40.94495 | missing | married | tertiary | no | 523 | yes | no | unknown | 5.00000 | missing | 849.0000 | 2.00000 | -1 | 0.0000000 | missing | no | TRUE |
| 40.94495 | missing | married | missing | no | 19 | yes | no | missing | 15.80327 | may | 252.0000 | 1.00000 | -1 | 0.5794895 | unknown | missing | TRUE |
| 35.00000 | services | missing | secondary | no | 59 | yes | no | unknown | 5.00000 | may | 1077.0000 | 2.76663 | -1 | 0.5794895 | missing | no | TRUE |
Summary Statistic After Cleaning
summary(mydf.clean)
## age job marital education
## Min. :18.00 blue-collar:9595 divorced: 5139 primary : 6758
## 1st Qu.:33.00 management :9322 married :26815 secondary:22855
## Median :39.00 technician :7485 single :12629 tertiary :13123
## Mean :40.94 admin. :5094 missing : 529 unknown : 1829
## 3rd Qu.:48.00 services :4096 missing : 547
## Max. :95.00 retired :2240
## (Other) :7280
## default balance housing loan
## no :43797 Min. : -8019 no :19803 no :37439
## yes : 809 1st Qu.: 62 yes :24746 yes : 7133
## missing: 506 Median : 436 missing: 563 missing: 540
## Mean : 1347
## 3rd Qu.: 1407
## Max. :102127
##
## contact day month duration
## cellular :28876 Min. : 1.0 may :13585 Min. : 0.0
## telephone: 2866 1st Qu.: 8.0 jul : 6785 1st Qu.: 104.0
## unknown :12836 Median :16.0 aug : 6161 Median : 182.0
## missing : 534 Mean :15.8 jun : 5268 Mean : 258.2
## 3rd Qu.:21.0 nov : 3920 3rd Qu.: 317.0
## Max. :31.0 apr : 2884 Max. :4918.0
## (Other): 6509
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 failure: 4847
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 other : 1813
## Median : 2.000 Median : -1.00 Median : 0.0000 success: 1486
## Mean : 2.766 Mean : 39.69 Mean : 0.5794 unknown:36461
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000 missing: 505
## Max. :63.000 Max. :871.00 Max. :275.0000
##
## y missing
## no :39367 Mode :logical
## yes : 5224 FALSE:43991
## missing: 521 TRUE :1121
##
##
##
##