This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Data Processing
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Data Wrangling
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
# Visualization
library(ggplot2)
library(shadowtext)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggstatsplot)
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(grid)
Cek sampel data menggunakan `head()` untuk melihat variabel apa saja yang ada pada data.
# Load the semicolon-delimited bank data set (the first row holds the column
# names) and preview its first six rows.
bank <- read.csv(file = "bank.csv", header = TRUE, sep = ";")
head(bank, n = 6L)
Berdasarkan hasil inspeksi data, terdapat 17 kolom.
# Show the structure of `bank`: its dimensions plus each column's type and first values.
str(bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
Menyesuaikan tipe data: kolom kategorikal diubah menjadi factor.
# Names of the columns that are converted from character to factor.
column <- c(
  "job", "marital", "education", "default", "housing",
  "loan", "contact", "poutcome", "y"
)
# Convert each listed column in place, then preview the converted columns.
bank[column] <- lapply(bank[column], as.factor)
head(bank[column])
# Distinct day-of-month values, in order of first appearance.
unique(bank[["day"]])
## [1] 19 11 16 3 5 23 14 6 17 20 13 30 29 27 7 18 12 21 26 22 2 4 15 8 28
## [26] 9 1 10 31 25 24
# Distinct month labels, in order of first appearance.
unique(bank[["month"]])
## [1] "oct" "may" "apr" "jun" "feb" "aug" "jan" "jul" "nov" "sep" "mar" "dec"
Buat kolom baru `dayandmonth` berisi tanggal lengkap hasil gabungan day, month, dan tahun 2012.
# Build a full calendar date for every row from the `day` and `month` columns.
# The data carries no year column, so a fixed placeholder year is used.
campaign_year <- 2012L
# Map the lowercase English abbreviations ("jan" ... "dec") to month numbers
# with base R's `month.abb`. Unlike parsing the month text with dmy(), this
# does not depend on the session's LC_TIME locale.
month_number <- match(bank$month, tolower(month.abb))
if (anyNA(month_number)) {
  # Mirror dmy()'s behaviour of warning (not failing) on unparseable input;
  # the affected rows get an NA date.
  warning("Unrecognised month abbreviation in bank$month", call. = FALSE)
}
bank$dayandmonth <- make_date(
  year = campaign_year,
  month = month_number,
  day = bank$day
)
head(bank$dayandmonth)
## [1] "2012-10-19" "2012-05-11" "2012-04-16" "2012-06-03" "2012-05-05"
## [6] "2012-02-23"
Buat kolom baru `yearandmonth` berisi tanggal pertama pada bulan dari `dayandmonth`.
# Truncate every date in `dayandmonth` to the first day of its month.
# floor_date() keeps the Date class, so no detour through substr()/paste0()/
# as.Date() on character strings is needed.
bank$yearandmonth <- floor_date(bank$dayandmonth, unit = "month")
head(bank$yearandmonth)
## [1] "2012-10-01" "2012-05-01" "2012-04-01" "2012-06-01" "2012-05-01"
## [6] "2012-02-01"
# Re-check the structure after the factor conversions and the two added date columns.
str(bank)
## 'data.frame': 4521 obs. of 19 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
## $ education : Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "oct" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ dayandmonth : Date, format: "2012-10-19" "2012-05-11" ...
## $ yearandmonth: Date, format: "2012-10-01" "2012-05-01" ...
# Count the missing values in every column.
colSums(is.na(bank))
## age job marital education default balance
## 0 0 0 0 0 0
## housing loan contact day month duration
## 0 0 0 0 0 0
## campaign pdays previous poutcome y dayandmonth
## 0 0 0 0 0 0
## yearandmonth
## 0
Tidak ada missing value
# Count the rows that are exact duplicates of an earlier row.
sum(duplicated(bank))
## [1] 0
# Per-column summary statistics.
summary(bank)
## age job marital education default
## Min. :19.00 management :969 divorced: 528 primary : 678 no :4445
## 1st Qu.:33.00 blue-collar:946 married :2797 secondary:2306 yes: 76
## Median :39.00 technician :768 single :1196 tertiary :1350
## Mean :41.17 admin. :478 unknown : 187
## 3rd Qu.:49.00 services :417
## Max. :87.00 retired :230
## (Other) :713
## balance housing loan contact day
## Min. :-3313 no :1962 no :3830 cellular :2896 Min. : 1.00
## 1st Qu.: 69 yes:2559 yes: 691 telephone: 301 1st Qu.: 9.00
## Median : 444 unknown :1324 Median :16.00
## Mean : 1423 Mean :15.92
## 3rd Qu.: 1480 3rd Qu.:21.00
## Max. :71188 Max. :31.00
##
## month duration campaign pdays
## Length:4521 Min. : 4 Min. : 1.000 Min. : -1.00
## Class :character 1st Qu.: 104 1st Qu.: 1.000 1st Qu.: -1.00
## Mode :character Median : 185 Median : 2.000 Median : -1.00
## Mean : 264 Mean : 2.794 Mean : 39.77
## 3rd Qu.: 329 3rd Qu.: 3.000 3rd Qu.: -1.00
## Max. :3025 Max. :50.000 Max. :871.00
##
## previous poutcome y dayandmonth
## Min. : 0.0000 failure: 490 no :4000 Min. :2012-01-07
## 1st Qu.: 0.0000 other : 197 yes: 521 1st Qu.:2012-05-11
## Median : 0.0000 success: 129 Median :2012-06-05
## Mean : 0.5426 unknown:3705 Mean :2012-06-20
## 3rd Qu.: 0.0000 3rd Qu.:2012-08-05
## Max. :25.0000 Max. :2012-12-31
##
## yearandmonth
## Min. :2012-01-01
## 1st Qu.:2012-05-01
## Median :2012-06-01
## Mean :2012-06-05
## 3rd Qu.:2012-08-01
## Max. :2012-12-01
##
Tidak ada data duplikat (hasil `sum(duplicated(bank))` di atas adalah 0).
# Add a constant column holding 1 for every row (presumably a per-row counter
# for later aggregation; confirm against its downstream use).
# The length comes from nrow(bank) instead of a hard-coded row count, and the
# column is a plain numeric vector rather than a one-column matrix.
bank$freq <- rep(1, nrow(bank))
head(bank$freq)
## [1] 1 1 1 1 1 1
# Distribution by day
# Tabulate the number of rows per date. as.data.frame(table(...)) stores the
# dates as labels in `Var1`, so ymd() turns them back into Date values.
freqday <- as.data.frame(table(bank$dayandmonth))
freqday$Var1 <- ymd(freqday$Var1)
plot(
  x = freqday$Var1,
  y = freqday$Freq,
  type = "l",
  main = "History Marketing Campaigns berdasarkan hari",
  xlab = "Date",
  ylab = "Jumlah Clients"
)
# Distribution by month
# Tabulate the number of rows per month-start date. as.data.frame(table(...))
# stores the dates as labels in `Var1`, so ymd() turns them back into Dates.
frq_month <- as.data.frame(table(bank$yearandmonth))
frq_month$Var1 <- ymd(frq_month$Var1)
plot(
  x = frq_month$Var1,
  y = frq_month$Freq,
  type = "l",
  main = "History Marketing Campaigns berdasarkan bulan",
  xlab = "Date",
  ylab = "Jumlah Clients"
)
# Correlation
# Numeric columns whose pairwise correlations are plotted.
cols <- c("age", "balance", "duration", "campaign", "pdays", "previous")
boxdt <- bank[cols]
# Correlation-matrix plot with the default title and subtitle removed and
# custom plot margins given in npc units.
corr <- ggcorrmat(
  data = boxdt,
  type = "parametric", # Pearson correlation
  colors = c("Green", "white", "steelblue")
) +
  labs(title = NULL, subtitle = NULL) +
  theme(
    plot.margin = margin(t = 0.15, r = 0, b = 0.1, l = 0.01, unit = "npc")
  )
corr
# Bold heading, anchored by its bottom-left corner at x = 0, y = 0.9.
grid.text(
  label = "Correlation: Numeric Variable",
  x = 0,
  y = 0.900,
  just = c("left", "bottom"),
  gp = gpar(fontsize = 22, fontface = "bold")
)
# Red horizontal rule spanning x = 0 to x = 1 at y = 1.
grid.lines(x = c(0, 1), y = 1, gp = gpar(col = "#e5001c", lwd = 4))
# Small red filled rectangle whose top-left corner sits at x = 0, y = 1.
grid.rect(
  x = 0, y = 1,
  width = 0.05, height = 0.025,
  just = c("left", "top"),
  gp = gpar(fill = "#e5001c", lwd = 0)
)