library(dplyr) 
library(knitr)
library(forcats)  #library to replace missing value

(A) About The Dataset

Introduction

The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).

Data source: https://archive.ics.uci.edu/ml/datasets/bank+marketing

Reading Data

mydf = read.table('data/dbanks.csv',sep=',',header = T)

Understanding Data

Statistical Summary

summary(mydf)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9844   divorced: 5315   primary  : 6966  
##  1st Qu.:33.00   management :9632   married :27579   secondary:23539  
##  Median :39.00   technician :7720   single  :13027   tertiary :13525  
##  Mean   :40.95   admin.     :5252   NA's    :  646   unknown  : 1879  
##  3rd Qu.:48.00   services   :4210                    NA's     :  658  
##  Max.   :95.00   (Other)    :9260                                     
##  NA's   :641     NA's       : 649                                     
##  default         balance       housing        loan            contact     
##  no  :45125   Min.   : -8019   no  :20400   no  :38591   cellular :29749  
##  yes :  825   1st Qu.:    73   yes :25489   yes : 7324   telephone: 2953  
##  NA's:  617   Median :   448   NA's:  678   NA's:  652   unknown  :13215  
##               Mean   :  1363                             NA's     :  650  
##               3rd Qu.:  1426                                              
##               Max.   :102127                                              
##               NA's   :658                                                 
##       day           month          duration         campaign     
##  Min.   : 1.0   may    :14017   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.0   jul    : 6989   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.0   aug    : 6359   Median : 180.0   Median : 2.000  
##  Mean   :15.8   jun    : 5426   Mean   : 258.1   Mean   : 2.767  
##  3rd Qu.:21.0   nov    : 4038   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.0   (Other): 9086   Max.   :4918.0   Max.   :63.000  
##  NA's   :632    NA's   :  652   NA's   :648      NA's   :657     
##      pdays           previous           poutcome        y        
##  Min.   : -1.00   Min.   :  0.0000   failure: 4994   no  :40547  
##  1st Qu.: -1.00   1st Qu.:  0.0000   other  : 1871   yes : 5385  
##  Median : -1.00   Median :  0.0000   success: 1538   NA's:  635  
##  Mean   : 40.31   Mean   :  0.5795   unknown:37551               
##  3rd Qu.: -1.00   3rd Qu.:  0.0000   NA's   :  613               
##  Max.   :871.00   Max.   :275.0000                               
##  NA's   :654      NA's   :649

(B) Data Validation

Check for Duplicate Rows

sum(duplicated(mydf))
## [1] 1450

Check for Missing Data

How Many Rows Contain Missing Data

sum(!complete.cases(mydf))
## [1] 1259

How Many Rows Are Completely Missing Values In All Columns

all.empty = rowSums(is.na(mydf))==ncol(mydf)
sum(all.empty)
## [1] 61

Missing Value By Variable

sapply(mydf, function(x) sum(is.na(x)))
##       age       job   marital education   default   balance   housing 
##       641       649       646       658       617       658       678 
##      loan   contact       day     month  duration  campaign     pdays 
##       652       650       632       652       648       657       654 
##  previous  poutcome         y 
##       649       613       635

(C) Data Cleaning

Remove Rows with All Columns Missing Value

mydf.clean = mydf[!all.empty,]

Remove Duplicated Rows

Saved deduplicated data to new variable ‘mydf.clean’

mydf.clean = mydf.clean %>% distinct

Number of Rows After Dedup

nrow(mydf.clean)
## [1] 45116

Impute Missing Values

Create New Column To Indicate Missing Detection

mydf.clean$missing = !complete.cases(mydf.clean)

Missing Numeric Value Treatment

Replace with Average

mydf.clean$age[is.na(mydf.clean$age)] = mean(mydf$age, na.rm=T)
mydf.clean$day[is.na(mydf.clean$day)] = mean(mydf$day, na.rm=T)
mydf.clean$duration[is.na(mydf.clean$duration)] = mean(mydf$duration, na.rm=T)
mydf.clean$previous[is.na(mydf.clean$previous)] = mean(mydf$previous, na.rm=T)
mydf.clean$campaign[is.na(mydf.clean$campaign)] = mean(mydf$campaign, na.rm=T)

Replace with Mode - Below variables distribution is highly skewed at at specific value, hence we are going to impute missing value with the mode

hist(mydf.clean$balance)

hist(mydf.clean$pdays)

mydf.clean$pdays[is.na(mydf.clean$pdays)] = as.numeric(names(sort(-table(mydf$pdays)))[1])
mydf.clean$balance[is.na(mydf.clean$balance)] = as.numeric(names(sort(-table(mydf$balance)))[1])

Missing Categorical Treatment

Replace with Special Category

mydf.clean$job = fct_explicit_na(mydf.clean$job, "missing")
mydf.clean$marital = fct_explicit_na(mydf.clean$marital, "missing")
mydf.clean$education = fct_explicit_na(mydf.clean$education, "missing")
mydf.clean$default = fct_explicit_na(mydf.clean$default, "missing")
mydf.clean$loan = fct_explicit_na(mydf.clean$loan, "missing")
mydf.clean$contact = fct_explicit_na(mydf.clean$contact, "missing")
mydf.clean$poutcome = fct_explicit_na(mydf.clean$poutcome, "missing")
mydf.clean$y = fct_explicit_na(mydf.clean$y, "missing")
mydf.clean$housing = fct_explicit_na(mydf.clean$housing, "missing")
mydf.clean$month = fct_explicit_na(mydf.clean$month, "missing")

Final Removal of Duplicated Rows

After imputation, certain rows became identical hence need to be deduplicated.

mydf.clean = mydf.clean %>% distinct

(D) Verify Cleanliness

Verify Deduplication

Number of Rows Before Dedup

nrow(mydf)
## [1] 46567

Number of Rows After Dedup

Number of rows had been reduced after deduplication

nrow(mydf.clean)
## [1] 45112

Number of Deduplicated Rows

There is no more duplicated rows

sum(duplicated(mydf.clean))
## [1] 0

Verify Missing Value

All missing value has been treated

sapply(mydf.clean, function(x) sum(is.na(x)))
##       age       job   marital education   default   balance   housing 
##         0         0         0         0         0         0         0 
##      loan   contact       day     month  duration  campaign     pdays 
##         0         0         0         0         0         0         0 
##  previous  poutcome         y   missing 
##         0         0         0         0

New ‘missing’ levels has been introduced

levels(mydf.clean$job)
##  [1] "admin."        "blue-collar"   "entrepreneur"  "housemaid"    
##  [5] "management"    "retired"       "self-employed" "services"     
##  [9] "student"       "technician"    "unemployed"    "unknown"      
## [13] "missing"
levels(mydf.clean$marital)
## [1] "divorced" "married"  "single"   "missing"
levels(mydf.clean$education)
## [1] "primary"   "secondary" "tertiary"  "unknown"   "missing"
levels(mydf.clean$default)
## [1] "no"      "yes"     "missing"
levels(mydf.clean$loan)
## [1] "no"      "yes"     "missing"
levels(mydf.clean$contact)
## [1] "cellular"  "telephone" "unknown"   "missing"
levels(mydf.clean$poutcome)
## [1] "failure" "other"   "success" "unknown" "missing"
levels(mydf.clean$y)
## [1] "no"      "yes"     "missing"
levels(mydf.clean$housing)
## [1] "no"      "yes"     "missing"
levels(mydf.clean$month)
##  [1] "apr"     "aug"     "dec"     "feb"     "jan"     "jul"     "jun"    
##  [8] "mar"     "may"     "nov"     "oct"     "sep"     "missing"

How Many Rows Had Missing Data Before Cleaning

sum(mydf.clean$missing)
## [1] 1121

Display Those Rows That Had Missing Data Before

mydf.clean %>% filter(missing==T) %>% head %>% kable
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y missing
28.00000 blue-collar married missing missing 0 yes missing missing 15.80327 may 258.0954 1.00000 -1 0.5794895 missing no TRUE
46.00000 missing missing missing no 0 yes missing missing 15.80327 may 258.0954 2.76663 -1 0.5794895 unknown no TRUE
32.00000 management missing tertiary no 0 missing no missing 5.00000 may 179.0000 1.00000 -1 0.0000000 missing no TRUE
40.94495 missing married tertiary no 523 yes no unknown 5.00000 missing 849.0000 2.00000 -1 0.0000000 missing no TRUE
40.94495 missing married missing no 19 yes no missing 15.80327 may 252.0000 1.00000 -1 0.5794895 unknown missing TRUE
35.00000 services missing secondary no 59 yes no unknown 5.00000 may 1077.0000 2.76663 -1 0.5794895 missing no TRUE

Summary Statistic After Cleaning

summary(mydf.clean)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9595   divorced: 5139   primary  : 6758  
##  1st Qu.:33.00   management :9322   married :26815   secondary:22855  
##  Median :39.00   technician :7485   single  :12629   tertiary :13123  
##  Mean   :40.94   admin.     :5094   missing :  529   unknown  : 1829  
##  3rd Qu.:48.00   services   :4096                    missing  :  547  
##  Max.   :95.00   retired    :2240                                     
##                  (Other)    :7280                                     
##     default         balance          housing           loan      
##  no     :43797   Min.   : -8019   no     :19803   no     :37439  
##  yes    :  809   1st Qu.:    62   yes    :24746   yes    : 7133  
##  missing:  506   Median :   436   missing:  563   missing:  540  
##                  Mean   :  1347                                  
##                  3rd Qu.:  1407                                  
##                  Max.   :102127                                  
##                                                                  
##       contact           day           month          duration     
##  cellular :28876   Min.   : 1.0   may    :13585   Min.   :   0.0  
##  telephone: 2866   1st Qu.: 8.0   jul    : 6785   1st Qu.: 104.0  
##  unknown  :12836   Median :16.0   aug    : 6161   Median : 182.0  
##  missing  :  534   Mean   :15.8   jun    : 5268   Mean   : 258.2  
##                    3rd Qu.:21.0   nov    : 3920   3rd Qu.: 317.0  
##                    Max.   :31.0   apr    : 2884   Max.   :4918.0  
##                                   (Other): 6509                   
##     campaign          pdays           previous           poutcome    
##  Min.   : 1.000   Min.   : -1.00   Min.   :  0.0000   failure: 4847  
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.:  0.0000   other  : 1813  
##  Median : 2.000   Median : -1.00   Median :  0.0000   success: 1486  
##  Mean   : 2.766   Mean   : 39.69   Mean   :  0.5794   unknown:36461  
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.:  0.0000   missing:  505  
##  Max.   :63.000   Max.   :871.00   Max.   :275.0000                  
##                                                                      
##        y          missing       
##  no     :39367   Mode :logical  
##  yes    : 5224   FALSE:43991    
##  missing:  521   TRUE :1121     
##                                 
##                                 
##                                 
##