library(plyr)
library(tidyverse)
library(reshape2)
library(fastDummies)
library(corrplot)
DATA622_Assignment1_CameronGray
EDA
= read.csv('bank-full.csv', sep=';')
data print(head(data,10))
age job marital education default balance housing loan contact day
1 58 management married tertiary no 2143 yes no unknown 5
2 44 technician single secondary no 29 yes no unknown 5
3 33 entrepreneur married secondary no 2 yes yes unknown 5
4 47 blue-collar married unknown no 1506 yes no unknown 5
5 33 unknown single unknown no 1 no no unknown 5
6 35 management married tertiary no 231 yes no unknown 5
7 28 management single tertiary no 447 yes yes unknown 5
8 42 entrepreneur divorced tertiary yes 2 yes no unknown 5
9 58 retired married primary no 121 yes no unknown 5
10 43 technician single secondary no 593 yes no unknown 5
month duration campaign pdays previous poutcome y
1 may 261 1 -1 0 unknown no
2 may 151 1 -1 0 unknown no
3 may 76 1 -1 0 unknown no
4 may 92 1 -1 0 unknown no
5 may 198 1 -1 0 unknown no
6 may 139 1 -1 0 unknown no
7 may 217 1 -1 0 unknown no
8 may 380 1 -1 0 unknown no
9 may 50 1 -1 0 unknown no
10 may 55 1 -1 0 unknown no
# seeing the top ten most common values for balance to
# identify how many are zero (it's a lot)
|>
data group_by(balance) |>
::summarize(n=n())|>
dplyrarrange(desc(n)) |>
head(10) |>
print()
# A tibble: 10 × 2
balance n
<int> <int>
1 0 3514
2 1 195
3 2 156
4 4 139
5 3 134
6 5 113
7 6 88
8 8 81
9 23 75
10 7 69
#showing all the bar charts in a single graph, not excluding zeros
|>
data as.list() |>
melt() |>
as_tibble() |>
ggplot() +
geom_bar(aes(x=value)) +
facet_wrap(~L1, scales = "free")
# all the box plots individually for easier viewing,
# excluding zero for better axes
for (x in 1:length(data)){
<- data[[x]][data[[x]] != 0 & data[[x]] != -1]
set print(ggplot() + geom_bar(aes(x=set)) + ggtitle(colnames(data)[x]))
}
# boxplot of each feature to see outliers
for (x in 1:length(data)){
if (class(data[[1,x]]) == 'integer'){
print(ggplot()
+ geom_boxplot(aes(y=data[[x]])) + ggtitle(colnames(data)[x]))
} }
# get counts for each balance
<- data |>
balance_count select(balance) |>
group_by(balance) |>
::summarize(count = n())
dplyr
# see bottom ten balance counts
|>
balance_counthead(10) |>
print()
# A tibble: 10 × 2
balance count
<int> <int>
1 -8019 1
2 -6847 1
3 -4057 1
4 -3372 1
5 -3313 1
6 -3058 1
7 -2827 1
8 -2712 1
9 -2604 1
10 -2282 1
# see top ten balance counts
|>
balance_countarrange(desc(balance)) |>
head(10) |>
print()
# A tibble: 10 × 2
balance count
<int> <int>
1 102127 1
2 98417 1
3 81204 2
4 71188 1
5 66721 1
6 66653 1
7 64343 1
8 59649 1
9 58932 1
10 58544 1
# # identifying all the categories for each feature
# # too much output to include
# for (ea in colnames(data)){
# data |>
# select(all_of(ea)) |>
# unique() |>
# print()
# }
# storing all changes to data in data1 to maintain original variable
<- data
data1
# converting all the yes/no features to numeric binary features
for (i in 1:length(data1)){
== "yes"] = 1
data1[[i]][data[[i]] == "no"] = 0
data1[[i]][data[[i]] # print(colnames(data1)[i])
if(data1[[1,i]] %in% c('0','1')){
<- sapply(data1[[i]], as.numeric)
data1[[i]]
}# # used to view the unique categories of each feature
# # too much output to include
# data1[[i]] |>
# unique() |>
# head(10) |>
# print()
}
# print('')
# assigning features to numeric values for those that have logical values
<- data1 |>
data1 mutate(
education = case_when(
== "unknown" ~ 0,
education == "primary" ~ 1,
education == "secondary" ~ 2,
education == "tertiary" ~ 3
education
),month = case_when(
== "jan" ~ 1,
month == "feb" ~ 2,
month == "mar" ~ 3,
month == "apr" ~ 4,
month == "may" ~ 5,
month == "jun" ~ 6,
month == "jul" ~ 7,
month == "aug" ~ 8,
month == "sep" ~ 9,
month == "oct" ~ 10,
month == "nov" ~ 11,
month == "dec" ~ 12
month
)
)
# helps to show all of the unique categories for
# each feature that isn't numeric
for (i in 1:length(data1)){
if (class(data1[[1,i]]) != 'numeric'){
print(colnames(data1)[i])
|>
data1[[i]] unique() |>
print()
} }
[1] "job"
[1] "management" "technician" "entrepreneur" "blue-collar"
[5] "unknown" "retired" "admin." "services"
[9] "self-employed" "unemployed" "housemaid" "student"
[1] "marital"
[1] "married" "single" "divorced"
[1] "contact"
[1] "unknown" "cellular" "telephone"
[1] "poutcome"
[1] "unknown" "failure" "other" "success"
# creating dummy columns to break out
# categorical features and remove old columns
<- data1 |>
dummydata dummy_cols(select_columns = c('job', 'marital', 'contact','poutcome')) |>
select(!job & !marital & !contact & !poutcome)
# correlation graph
<- cor(dummydata)
cordata |>
cordata as.tibble() |>
print()
# A tibble: 35 × 35
age education default balance housing loan day month
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -0.174 -0.0179 0.0978 -0.186 -0.0157 -0.00912 0.0929
2 -0.174 1 -0.00904 0.0506 -0.0387 0.00431 0.0217 0.0679
3 -0.0179 -0.00904 1 -0.0667 -0.00603 0.0772 0.00942 0.0150
4 0.0978 0.0506 -0.0667 1 -0.0688 -0.0844 0.00450 0.0946
5 -0.186 -0.0387 -0.00603 -0.0688 1 0.0413 -0.0280 -0.174
6 -0.0157 0.00431 0.0772 -0.0844 0.0413 1 0.0114 0.0216
7 -0.00912 0.0217 0.00942 0.00450 -0.0280 0.0114 1 0.102
8 0.0929 0.0679 0.0150 0.0946 -0.174 0.0216 0.102 1
9 -0.00465 0.00255 -0.0100 0.0216 0.00508 -0.0124 -0.0302 -0.0119
10 0.00476 0.000194 0.0168 -0.0146 -0.0236 0.00998 0.162 0.0549
# ℹ 25 more rows
# ℹ 27 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
# previous <dbl>, y <dbl>, job_admin. <dbl>, `job_blue-collar` <dbl>,
# job_entrepreneur <dbl>, job_housemaid <dbl>, job_management <dbl>,
# job_retired <dbl>, `job_self-employed` <dbl>, job_services <dbl>,
# job_student <dbl>, job_technician <dbl>, job_unemployed <dbl>,
# job_unknown <dbl>, marital_divorced <dbl>, marital_married <dbl>, …
corrplot.mixed(cordata, tl.pos="lt")
# investigating the relation between pdays and poutcome
|>
data ggplot() +
geom_point(aes(x=pdays, y=poutcome))
# visualizing distribution of pdays for poutcome = success
|>
data select(poutcome, pdays) |>
filter(poutcome == "success" & pdays != -1) |>
ggplot() +
geom_bar(aes(x=pdays))
# visualizing distribution of pdays for poutcome = failure
|>
data select(poutcome, pdays) |>
filter(poutcome == "failure" & pdays != -1) |>
ggplot() +
geom_bar(aes(x=pdays))
# investigating count of poutcome-pdays pairs to see if there
# is a match between poutcome = unknown and pdays = -1
|>
data select(poutcome, pdays) |>
group_by(poutcome, pdays) |>
::summarize(count = n()) |>
dplyrarrange(desc(count)) |>
head(10) |>
print()
# A tibble: 10 × 3
# Groups: poutcome [3]
poutcome pdays count
<chr> <int> <int>
1 unknown -1 36954
2 success 92 94
3 failure 370 80
4 success 182 80
5 failure 182 68
6 success 91 67
7 success 181 64
8 success 183 62
9 failure 350 57
10 failure 183 53
# investigation into pdays and why there are multiple peaks in distribution
|>
data select(pdays) |>
filter(pdays != -1) |>
group_by(pdays) |>
::summarize(n = n()) |>
dplyrarrange(desc(n)) |>
head(10) |>
print()
# A tibble: 10 × 2
pdays n
<int> <int>
1 182 167
2 92 147
3 91 126
4 183 126
5 181 117
6 370 99
7 184 85
8 364 77
9 95 74
10 350 73
# seeing if there are any duplicate rows
|>
data group_by(across(all_of(colnames(data)))) |>
::summarize(n = n()) |>
dplyrarrange(desc(n)) |>
head(5) |>
print()
# A tibble: 5 × 18
# Groups: age, job, marital, education, default, balance, housing, loan,
# contact, day, month, duration, campaign, pdays, previous, poutcome [5]
age job marital education default balance housing loan contact day
<int> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
1 18 student single primary no 608 no no cellular 12
2 18 student single primary no 608 no no cellular 13
3 18 student single primary no 1944 no no telephone 10
4 18 student single secondary no 5 no no cellular 24
5 18 student single secondary no 156 no no cellular 4
# ℹ 8 more variables: month <chr>, duration <int>, campaign <int>, pdays <int>,
# previous <int>, poutcome <chr>, y <chr>, n <int>
Pre-processing
# setting of todays date as days since beginning of year
= 365
today
<- data |>
ppdata mutate(
# adjusting poutcome to have a "new" for when a customer wasn't part of
# previous campaing according to pdays instead of making unknown
poutcome = case_when(pdays == -1 ~ "new",
.default = poutcome),
# converts day and month to number of days since beginning of year
date = case_when(
== "jan" ~ day,
month == "feb" ~ 31 + day,
month == "mar" ~ 59 + day,
month == "apr" ~ 90 + day,
month == "may" ~ 120 + day,
month == "jun" ~ 151 + day,
month == "jul" ~ 181 + day,
month == "aug" ~ 212 + day,
month == "sep" ~ 243 + day,
month == "oct" ~ 273 + day,
month == "nov" ~ 304 + day,
month == "dec" ~ 334 + day
month
),# calculate days since the current date
days_since = case_when(
> today ~ round_any(365 - today + date, 10, f = floor),
date < today ~ round_any(today - date, 10, f=floor)
date
),# bins all continuous data into less granular categories
# some get a max values where all values greater are binned together
age = round_any(age,5,floor),
balance = case_when(
< 25000 ~ round_any(balance, 1000, floor),
balance .default = 25000
),duration = case_when(
< 2000 ~ round_any(duration, 10, floor),
duration .default = 2000
),previous = case_when(
< 20 ~ previous,
previous .default = 20
),campaign = case_when(
< 20 ~ campaign,
campaign .default = 20
),pdays = case_when(
== -1 ~ -1,
pdays < 365 ~ round_any(pdays,10,floor),
pdays .default = 365
),
)
<- ppdata|>
ppdata # remove unnecessary features
select(!day & !month & !date & !poutcome & !contact)
# view a sample of the data
|>
ppdata head(10) |>
print()
age job marital education default balance housing loan duration
1 55 management married tertiary no 2000 yes no 260
2 40 technician single secondary no 0 yes no 150
3 30 entrepreneur married secondary no 0 yes yes 70
4 45 blue-collar married unknown no 1000 yes no 90
5 30 unknown single unknown no 0 no no 190
6 35 management married tertiary no 0 yes no 130
7 25 management single tertiary no 0 yes yes 210
8 40 entrepreneur divorced tertiary yes 0 yes no 380
9 55 retired married primary no 0 yes no 50
10 40 technician single secondary no 0 yes no 50
campaign pdays previous y days_since
1 1 -1 0 no 240
2 1 -1 0 no 240
3 1 -1 0 no 240
4 1 -1 0 no 240
5 1 -1 0 no 240
6 1 -1 0 no 240
7 1 -1 0 no 240
8 1 -1 0 no 240
9 1 -1 0 no 240
10 1 -1 0 no 240
#showing all the bar charts in a single graph, not excluding zeros
|>
ppdata as.list() |>
melt() |>
as_tibble() |>
arrange(value) |>
print() |>
ggplot() +
geom_bar(aes(x=value)) +
facet_wrap(~L1, scales = "free")
# A tibble: 632,954 × 2
value L1
<chr> <chr>
1 -1 pdays
2 -1 pdays
3 -1 pdays
4 -1 pdays
5 -1 pdays
6 -1 pdays
7 -1 pdays
8 -1 pdays
9 -1 pdays
10 -1 pdays
# ℹ 632,944 more rows