DATA622_Assignment1_CameronGray

Author

Cameron Gray

library(plyr)
library(tidyverse)
library(reshape2)
library(fastDummies)
library(corrplot)

EDA

data = read.csv('bank-full.csv', sep=';')
print(head(data,10))
   age          job  marital education default balance housing loan contact day
1   58   management  married  tertiary      no    2143     yes   no unknown   5
2   44   technician   single secondary      no      29     yes   no unknown   5
3   33 entrepreneur  married secondary      no       2     yes  yes unknown   5
4   47  blue-collar  married   unknown      no    1506     yes   no unknown   5
5   33      unknown   single   unknown      no       1      no   no unknown   5
6   35   management  married  tertiary      no     231     yes   no unknown   5
7   28   management   single  tertiary      no     447     yes  yes unknown   5
8   42 entrepreneur divorced  tertiary     yes       2     yes   no unknown   5
9   58      retired  married   primary      no     121     yes   no unknown   5
10  43   technician   single secondary      no     593     yes   no unknown   5
   month duration campaign pdays previous poutcome  y
1    may      261        1    -1        0  unknown no
2    may      151        1    -1        0  unknown no
3    may       76        1    -1        0  unknown no
4    may       92        1    -1        0  unknown no
5    may      198        1    -1        0  unknown no
6    may      139        1    -1        0  unknown no
7    may      217        1    -1        0  unknown no
8    may      380        1    -1        0  unknown no
9    may       50        1    -1        0  unknown no
10   may       55        1    -1        0  unknown no
# seeing the top ten most common values for balance to 
# identify how many are zero (it's a lot)
data |>
  group_by(balance) |>
  dplyr::summarize(n=n())|>
  arrange(desc(n)) |>
  head(10) |>
  print()
# A tibble: 10 × 2
   balance     n
     <int> <int>
 1       0  3514
 2       1   195
 3       2   156
 4       4   139
 5       3   134
 6       5   113
 7       6    88
 8       8    81
 9      23    75
10       7    69
#showing all the bar charts in a single graph, not excluding zeros
data |>
  as.list() |>
  melt() |>
  as_tibble() |>
  ggplot() +
  geom_bar(aes(x=value)) +
  facet_wrap(~L1, scales = "free")

# all the box plots individually for easier viewing, 
# excluding zero for better axes
for (x in 1:length(data)){
  set <- data[[x]][data[[x]] != 0 & data[[x]] != -1]
  print(ggplot() + geom_bar(aes(x=set)) + ggtitle(colnames(data)[x]))
}

# boxplot of each feature to see outliers
for (x in 1:length(data)){
  if (class(data[[1,x]]) == 'integer'){
    print(ggplot() 
          + geom_boxplot(aes(y=data[[x]])) + ggtitle(colnames(data)[x]))
  }
}

# get counts for each balance
balance_count <- data |>
  select(balance) |>
  group_by(balance) |>
  dplyr::summarize(count = n()) 

# see bottom ten balance counts
balance_count|>
  head(10) |>
  print()
# A tibble: 10 × 2
   balance count
     <int> <int>
 1   -8019     1
 2   -6847     1
 3   -4057     1
 4   -3372     1
 5   -3313     1
 6   -3058     1
 7   -2827     1
 8   -2712     1
 9   -2604     1
10   -2282     1
# see top ten balance counts
balance_count|>
  arrange(desc(balance)) |>
  head(10) |>
  print()
# A tibble: 10 × 2
   balance count
     <int> <int>
 1  102127     1
 2   98417     1
 3   81204     2
 4   71188     1
 5   66721     1
 6   66653     1
 7   64343     1
 8   59649     1
 9   58932     1
10   58544     1
# # identifying all the categories for each feature
# # too much output to include
# for (ea in colnames(data)){
#   data |>
#     select(all_of(ea)) |>
#     unique() |>
#     print()
# }
# storing all changes to data in data1 to maintain original variable
data1 <- data

# converting all the yes/no features to numeric binary features
for (i in 1:length(data1)){
  data1[[i]][data[[i]] == "yes"] = 1
  data1[[i]][data[[i]] == "no"] = 0
  # print(colnames(data1)[i])
  if(data1[[1,i]] %in% c('0','1')){
    data1[[i]] <- sapply(data1[[i]], as.numeric)
  }
  # # used to view the unique categories of each feature
  # # too much output to include
  # data1[[i]] |>
  #   unique() |>
  #   head(10) |>
  #   print()
}

# print('')

# assigning features to numeric values for those that have logical values
data1 <- data1 |>
  mutate(
    education = case_when(
      education == "unknown" ~ 0,
      education == "primary" ~ 1,
      education == "secondary" ~ 2,
      education == "tertiary" ~ 3
    ),
    month = case_when(
      month == "jan" ~ 1,
      month == "feb" ~ 2,
      month == "mar" ~ 3,
      month == "apr" ~ 4,
      month == "may" ~ 5,
      month == "jun" ~ 6,
      month == "jul" ~ 7,
      month == "aug" ~ 8,
      month == "sep" ~ 9,
      month == "oct" ~ 10,
      month == "nov" ~ 11,
      month == "dec" ~ 12
    )
  )

# helps to show all of the unique categories for 
# each feature that isn't numeric
for (i in 1:length(data1)){
  if (class(data1[[1,i]]) != 'numeric'){
    print(colnames(data1)[i])
    data1[[i]] |>
      unique() |>
      print()
  }
}
[1] "job"
 [1] "management"    "technician"    "entrepreneur"  "blue-collar"  
 [5] "unknown"       "retired"       "admin."        "services"     
 [9] "self-employed" "unemployed"    "housemaid"     "student"      
[1] "marital"
[1] "married"  "single"   "divorced"
[1] "contact"
[1] "unknown"   "cellular"  "telephone"
[1] "poutcome"
[1] "unknown" "failure" "other"   "success"
# creating dummy columns to break out 
# categorical features and remove old columns
dummydata <- data1 |>
  dummy_cols(select_columns = c('job', 'marital', 'contact','poutcome')) |>
  select(!job & !marital & !contact & !poutcome)

# correlation graph
cordata <- cor(dummydata) 
cordata |> 
  as.tibble() |>
  print()
# A tibble: 35 × 35
        age education  default  balance  housing     loan      day   month
      <dbl>     <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>
 1  1       -0.174    -0.0179   0.0978  -0.186   -0.0157  -0.00912  0.0929
 2 -0.174    1        -0.00904  0.0506  -0.0387   0.00431  0.0217   0.0679
 3 -0.0179  -0.00904   1       -0.0667  -0.00603  0.0772   0.00942  0.0150
 4  0.0978   0.0506   -0.0667   1       -0.0688  -0.0844   0.00450  0.0946
 5 -0.186   -0.0387   -0.00603 -0.0688   1        0.0413  -0.0280  -0.174 
 6 -0.0157   0.00431   0.0772  -0.0844   0.0413   1        0.0114   0.0216
 7 -0.00912  0.0217    0.00942  0.00450 -0.0280   0.0114   1        0.102 
 8  0.0929   0.0679    0.0150   0.0946  -0.174    0.0216   0.102    1     
 9 -0.00465  0.00255  -0.0100   0.0216   0.00508 -0.0124  -0.0302  -0.0119
10  0.00476  0.000194  0.0168  -0.0146  -0.0236   0.00998  0.162    0.0549
# ℹ 25 more rows
# ℹ 27 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
#   previous <dbl>, y <dbl>, job_admin. <dbl>, `job_blue-collar` <dbl>,
#   job_entrepreneur <dbl>, job_housemaid <dbl>, job_management <dbl>,
#   job_retired <dbl>, `job_self-employed` <dbl>, job_services <dbl>,
#   job_student <dbl>, job_technician <dbl>, job_unemployed <dbl>,
#   job_unknown <dbl>, marital_divorced <dbl>, marital_married <dbl>, …
corrplot.mixed(cordata, tl.pos="lt")

# investigating the relation between pdays and poutcome
data |>
  ggplot() +
  geom_point(aes(x=pdays, y=poutcome))

# visualizing distribution of pdays for poutcome = success
data |>
  select(poutcome, pdays) |>
  filter(poutcome == "success" & pdays != -1) |>
  ggplot() +
  geom_bar(aes(x=pdays))

# visualizing distribution of pdays for poutcome = failure
data |>
  select(poutcome, pdays) |>
  filter(poutcome == "failure" & pdays != -1) |>
  ggplot() +
  geom_bar(aes(x=pdays))

# investigating count of poutcome-pdays pairs to see if there 
# is a match between poutcome = unknown and pdays = -1
data |>
  select(poutcome, pdays) |>
  group_by(poutcome, pdays) |>
  dplyr::summarize(count = n()) |>
  arrange(desc(count)) |>
  head(10) |>
  print()
# A tibble: 10 × 3
# Groups:   poutcome [3]
   poutcome pdays count
   <chr>    <int> <int>
 1 unknown     -1 36954
 2 success     92    94
 3 failure    370    80
 4 success    182    80
 5 failure    182    68
 6 success     91    67
 7 success    181    64
 8 success    183    62
 9 failure    350    57
10 failure    183    53
# investigation into pdays and why there are multiple peaks in distribution
data |>
  select(pdays) |>
  filter(pdays != -1) |>
  group_by(pdays) |>
  dplyr::summarize(n = n()) |>
  arrange(desc(n)) |>
  head(10) |>
  print()
# A tibble: 10 × 2
   pdays     n
   <int> <int>
 1   182   167
 2    92   147
 3    91   126
 4   183   126
 5   181   117
 6   370    99
 7   184    85
 8   364    77
 9    95    74
10   350    73
# seeing if there are any duplicate rows
data |>
  group_by(across(all_of(colnames(data)))) |>
  dplyr::summarize(n = n()) |>
  arrange(desc(n)) |>
  head(5) |>
  print()
# A tibble: 5 × 18
# Groups:   age, job, marital, education, default, balance, housing, loan,
#   contact, day, month, duration, campaign, pdays, previous, poutcome [5]
    age job     marital education default balance housing loan  contact     day
  <int> <chr>   <chr>   <chr>     <chr>     <int> <chr>   <chr> <chr>     <int>
1    18 student single  primary   no          608 no      no    cellular     12
2    18 student single  primary   no          608 no      no    cellular     13
3    18 student single  primary   no         1944 no      no    telephone    10
4    18 student single  secondary no            5 no      no    cellular     24
5    18 student single  secondary no          156 no      no    cellular      4
# ℹ 8 more variables: month <chr>, duration <int>, campaign <int>, pdays <int>,
#   previous <int>, poutcome <chr>, y <chr>, n <int>

Pre-processing

# setting of todays date as days since beginning of year
today = 365

ppdata <- data |>
  mutate(
    # adjusting poutcome to have a "new" for when a customer wasn't part of 
    # previous campaing according to pdays instead of making unknown
    poutcome = case_when(pdays == -1 ~ "new",
                         .default = poutcome),
    # converts day and month to number of days since beginning of year
    date = case_when(
      month == "jan" ~ day,
      month == "feb" ~ 31 + day,
      month == "mar" ~ 59 + day,
      month == "apr" ~ 90 + day,
      month == "may" ~ 120 + day,
      month == "jun" ~ 151 + day,
      month == "jul" ~ 181 + day,
      month == "aug" ~ 212 + day,
      month == "sep" ~ 243 + day,
      month == "oct" ~ 273 + day,
      month == "nov" ~ 304 + day,
      month == "dec" ~ 334 + day
    ),
    # calculate days since the current date
    days_since = case_when(
      date > today ~ round_any(365 - today + date, 10, f = floor),
      date < today ~ round_any(today - date, 10, f=floor)
    ),
    # bins all continuous data into less granular categories
    # some get a max values where all values greater are binned together
    age = round_any(age,5,floor),
    balance = case_when(
      balance < 25000 ~ round_any(balance, 1000, floor),
      .default = 25000
    ),
    duration = case_when(
      duration < 2000 ~ round_any(duration, 10, floor),
      .default = 2000
    ),
    previous = case_when(
      previous < 20 ~ previous,
      .default = 20
    ),
    campaign = case_when(
      campaign < 20 ~ campaign,
      .default = 20
    ),
    pdays = case_when(
      pdays == -1 ~ -1,
      pdays < 365 ~ round_any(pdays,10,floor),
      .default = 365
    ),
  ) 

ppdata <- ppdata|>
  # remove unnecessary features
  select(!day & !month & !date & !poutcome & !contact)

# view a sample of the data
ppdata |>
  head(10) |>
  print()
   age          job  marital education default balance housing loan duration
1   55   management  married  tertiary      no    2000     yes   no      260
2   40   technician   single secondary      no       0     yes   no      150
3   30 entrepreneur  married secondary      no       0     yes  yes       70
4   45  blue-collar  married   unknown      no    1000     yes   no       90
5   30      unknown   single   unknown      no       0      no   no      190
6   35   management  married  tertiary      no       0     yes   no      130
7   25   management   single  tertiary      no       0     yes  yes      210
8   40 entrepreneur divorced  tertiary     yes       0     yes   no      380
9   55      retired  married   primary      no       0     yes   no       50
10  40   technician   single secondary      no       0     yes   no       50
   campaign pdays previous  y days_since
1         1    -1        0 no        240
2         1    -1        0 no        240
3         1    -1        0 no        240
4         1    -1        0 no        240
5         1    -1        0 no        240
6         1    -1        0 no        240
7         1    -1        0 no        240
8         1    -1        0 no        240
9         1    -1        0 no        240
10        1    -1        0 no        240
#showing all the bar charts in a single graph, not excluding zeros
ppdata |>
  as.list() |>
  melt() |>
  as_tibble() |>
  arrange(value) |>
  print() |>
  ggplot() +
  geom_bar(aes(x=value)) +
  facet_wrap(~L1, scales = "free")
# A tibble: 632,954 × 2
   value L1   
   <chr> <chr>
 1 -1    pdays
 2 -1    pdays
 3 -1    pdays
 4 -1    pdays
 5 -1    pdays
 6 -1    pdays
 7 -1    pdays
 8 -1    pdays
 9 -1    pdays
10 -1    pdays
# ℹ 632,944 more rows