This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

1 Importing Libraries

# Data Processing
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Data Wrangling
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr   1.1.4     ✔ readr   2.1.5
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1     ✔ tibble  3.2.1
## ✔ purrr   1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
# Visualization
library(ggplot2)
library(shadowtext)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(grid)

2 Import Data

Cek sampel data menggunakan `head()` untuk melihat variabel apa saja yang ada pada data.

# Load the semicolon-delimited file (first row holds the column names),
# then preview the first rows.
bank <- read.csv(file = "bank.csv", header = TRUE, sep = ";")
head(x = bank)

Berdasarkan hasil inspeksi data, terdapat 17 kolom.

2 Data Processing

A Data Types

# Show the structure (column names and types) of the freshly loaded data.
str(object = bank)
## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : chr  "unemployed" "services" "management" "management" ...
##  $ marital  : chr  "married" "married" "single" "married" ...
##  $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : chr  "no" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "yes" "no" "yes" ...
##  $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : chr  "oct" "may" "apr" "jun" ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

Menyesuaikan tipe data :

# Columns listed here are converted to factors in place.
column <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "poutcome", "y"
)
bank[column] <- lapply(bank[column], as.factor)
head(bank[column])
# Distinct values found in the day column.
unique(bank[["day"]])
##  [1] 19 11 16  3  5 23 14  6 17 20 13 30 29 27  7 18 12 21 26 22  2  4 15  8 28
## [26]  9  1 10 31 25 24
# Distinct values found in the month column.
unique(bank[["month"]])
##  [1] "oct" "may" "apr" "jun" "feb" "aug" "jan" "jul" "nov" "sep" "mar" "dec"

Buat kolom baru `dayandmonth` berisi tanggal lengkap dari day & month (tahun diasumsikan 2012 karena data tidak memiliki kolom tahun)

# Join day, month and the fixed year "2012" into text such as "19 oct 2012",
# then parse that text as a day-month-year date.
bank$dayandmonth <- dmy(paste(bank$day, bank$month, "2012", sep = " "))
head(bank$dayandmonth)
## [1] "2012-10-19" "2012-05-11" "2012-04-16" "2012-06-03" "2012-05-05"
## [6] "2012-02-23"

Buat kolom baru `yearandmonth` berisi tanggal pertama pada bulan dari `dayandmonth`

# Take the first seven characters ("YYYY-MM") of each date, append "-01",
# and parse the result back into a Date for the first day of that month.
bank$yearandmonth <- as.Date(
  paste0(substr(bank$dayandmonth, 1, 7), "-01"),
  format = "%Y-%m-%d"
)
head(bank$yearandmonth)
## [1] "2012-10-01" "2012-05-01" "2012-04-01" "2012-06-01" "2012-05-01"
## [6] "2012-02-01"

B Missing Values

# Re-inspect the structure after the type conversions and the new date columns.
str(object = bank)
## 'data.frame':    4521 obs. of  19 variables:
##  $ age         : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job         : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital     : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
##  $ education   : Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
##  $ default     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance     : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing     : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan        : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
##  $ contact     : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ day         : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month       : chr  "oct" "may" "apr" "jun" ...
##  $ duration    : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign    : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays       : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous    : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome    : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y           : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ dayandmonth : Date, format: "2012-10-19" "2012-05-11" ...
##  $ yearandmonth: Date, format: "2012-10-01" "2012-05-01" ...
# Number of NA entries in each column.
colSums(x = is.na(bank))
##          age          job      marital    education      default      balance 
##            0            0            0            0            0            0 
##      housing         loan      contact          day        month     duration 
##            0            0            0            0            0            0 
##     campaign        pdays     previous     poutcome            y  dayandmonth 
##            0            0            0            0            0            0 
## yearandmonth 
##            0

Tidak ada missing value

C Duplicates

# Number of rows that repeat an earlier row exactly.
nrow(bank[duplicated(bank), , drop = FALSE])
## [1] 0

3 Exploratory Data Analysis

# Per-column summary statistics.
summary(object = bank)
##       age                 job          marital         education    default   
##  Min.   :19.00   management :969   divorced: 528   primary  : 678   no :4445  
##  1st Qu.:33.00   blue-collar:946   married :2797   secondary:2306   yes:  76  
##  Median :39.00   technician :768   single  :1196   tertiary :1350             
##  Mean   :41.17   admin.     :478                   unknown  : 187             
##  3rd Qu.:49.00   services   :417                                              
##  Max.   :87.00   retired    :230                                              
##                  (Other)    :713                                              
##     balance      housing     loan           contact          day       
##  Min.   :-3313   no :1962   no :3830   cellular :2896   Min.   : 1.00  
##  1st Qu.:   69   yes:2559   yes: 691   telephone: 301   1st Qu.: 9.00  
##  Median :  444                         unknown  :1324   Median :16.00  
##  Mean   : 1423                                          Mean   :15.92  
##  3rd Qu.: 1480                                          3rd Qu.:21.00  
##  Max.   :71188                                          Max.   :31.00  
##                                                                        
##     month              duration       campaign          pdays       
##  Length:4521        Min.   :   4   Min.   : 1.000   Min.   : -1.00  
##  Class :character   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
##  Mode  :character   Median : 185   Median : 2.000   Median : -1.00  
##                     Mean   : 264   Mean   : 2.794   Mean   : 39.77  
##                     3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
##                     Max.   :3025   Max.   :50.000   Max.   :871.00  
##                                                                     
##     previous          poutcome      y         dayandmonth        
##  Min.   : 0.0000   failure: 490   no :4000   Min.   :2012-01-07  
##  1st Qu.: 0.0000   other  : 197   yes: 521   1st Qu.:2012-05-11  
##  Median : 0.0000   success: 129              Median :2012-06-05  
##  Mean   : 0.5426   unknown:3705              Mean   :2012-06-20  
##  3rd Qu.: 0.0000                             3rd Qu.:2012-08-05  
##  Max.   :25.0000                             Max.   :2012-12-31  
##                                                                  
##   yearandmonth       
##  Min.   :2012-01-01  
##  1st Qu.:2012-05-01  
##  Median :2012-06-01  
##  Mean   :2012-06-05  
##  3rd Qu.:2012-08-01  
##  Max.   :2012-12-01  
## 

Tidak ada data duplikat (hasil pengecekan `duplicated` pada bagian C Duplicates bernilai 0).

# Add Col
# Constant column of ones, one value per row (presumably so that rows can be
# counted later by summing `freq` -- confirm against downstream use).
# The length comes from nrow(bank) rather than a hard-coded 4521, and the
# column is a plain numeric vector instead of a one-column matrix embedded in
# the data frame.
bank$freq <- rep(1, nrow(bank))
head(bank$freq)
## [1] 1 1 1 1 1 1

4 Exploratory Visualization

# Distribution by day: count the rows for each date and draw the counts
# as a line over time.
freqday <- as.data.frame(table(bank$dayandmonth))
freqday$Var1 <- ymd(freqday$Var1)
plot(
  x = freqday$Var1,
  y = freqday$Freq,
  type = "l",
  main = "History Marketing Campaigns berdasarkan hari",
  xlab = "Date",
  ylab = "Jumlah Clients"
)

# Distribution by month: count the rows for each first-of-month date and
# draw the counts as a line over time.
frq_month <- as.data.frame(table(bank$yearandmonth))
frq_month$Var1 <- ymd(frq_month$Var1)
plot(
  x = frq_month$Var1,
  y = frq_month$Freq,
  type = "l",
  main = "History Marketing Campaigns berdasarkan bulan",
  xlab = "Date",
  ylab = "Jumlah Clients"
)

# Correlation matrix of the numeric columns.
cols <- c("age", "balance", "duration", "campaign", "pdays", "previous")
boxdt <- bank[cols]

# type = "parametric" requests the Pearson correlation. The built-in title and
# subtitle are removed and the plot margins are set explicitly.
corr <- ggcorrmat(
  data = boxdt,
  type = "parametric",
  colors = c("Green", "white", "steelblue")
) +
  labs(title = NULL, subtitle = NULL) +
  theme(
    plot.margin = margin(t = 0.15, r = 0, b = 0.1, l = 0.01, unit = "npc")
  )

corr

# Bold title text, anchored by its bottom-left corner at x = 0, y = 0.9.
grid.text(
  label = "Correlation: Numeric Variable",
  x = 0,
  y = 0.900,
  just = c("left", "bottom"),
  gp = gpar(fontsize = 22, fontface = "bold")
)

# Red line from x = 0 to x = 1 at y = 1.
grid.lines(x = c(0, 1), y = 1, gp = gpar(col = "#e5001c", lwd = 4))

# Small red rectangle whose top-left corner sits at x = 0, y = 1.
grid.rect(
  x = 0,
  y = 1,
  width = 0.05,
  height = 0.025,
  just = c("left", "top"),
  gp = gpar(fill = "#e5001c", lwd = 0)
)