1. LOAD LIBRARIES

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

2. LOAD DATA

# Load bank.csv into a data frame and print each column's type.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# where that exact file exists.
bank <- read.csv(
  file = "/Users/regishamanandhar/Documents/DCS 402/bank.csv"
)
str(object = bank)
## 'data.frame':    11162 obs. of  17 variables:
##  $ age      : int  59 56 41 55 54 42 56 60 37 28 ...
##  $ job      : chr  "admin." "admin." "technician" "services" ...
##  $ marital  : chr  "married" "married" "married" "married" ...
##  $ education: chr  "secondary" "secondary" "secondary" "secondary" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2343 45 1270 2476 184 0 830 545 1 5090 ...
##  $ housing  : chr  "yes" "no" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "no" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 6 6 6 6 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  1042 1467 1389 579 673 562 1201 1030 608 1297 ...
##  $ campaign : int  1 1 1 1 2 2 1 1 1 3 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ deposit  : chr  "yes" "yes" "yes" "yes" ...
# Per-column summary: quartiles and mean for the integer columns,
# length/class/mode for the character columns.
summary(bank)
##       age            job              marital           education        
##  Min.   :18.00   Length:11162       Length:11162       Length:11162      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :39.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :41.23                                                           
##  3rd Qu.:49.00                                                           
##  Max.   :95.00                                                           
##    default             balance        housing              loan          
##  Length:11162       Min.   :-6847   Length:11162       Length:11162      
##  Class :character   1st Qu.:  122   Class :character   Class :character  
##  Mode  :character   Median :  550   Mode  :character   Mode  :character  
##                     Mean   : 1529                                        
##                     3rd Qu.: 1708                                        
##                     Max.   :81204                                        
##    contact               day           month              duration   
##  Length:11162       Min.   : 1.00   Length:11162       Min.   :   2  
##  Class :character   1st Qu.: 8.00   Class :character   1st Qu.: 138  
##  Mode  :character   Median :15.00   Mode  :character   Median : 255  
##                     Mean   :15.66                      Mean   : 372  
##                     3rd Qu.:22.00                      3rd Qu.: 496  
##                     Max.   :31.00                      Max.   :3881  
##     campaign          pdays           previous         poutcome        
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000   Length:11162      
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000   Class :character  
##  Median : 2.000   Median : -1.00   Median : 0.0000   Mode  :character  
##  Mean   : 2.508   Mean   : 51.33   Mean   : 0.8326                     
##  3rd Qu.: 3.000   3rd Qu.: 20.75   3rd Qu.: 1.0000                     
##  Max.   :63.000   Max.   :854.00   Max.   :58.0000                     
##    deposit         
##  Length:11162      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

3. DATA CLEANING

3.1 NULL VALUE ASSESSMENT

# Count the NA entries in each column of the raw data.
missing_values <- bank %>%
  is.na() %>%
  colSums()

# One row per column: its name and how many NAs it holds.
missing_df <- data.frame(column = names(missing_values),
                         missing_count = missing_values)
missing_df
##              column missing_count
## age             age             0
## job             job             0
## marital     marital             0
## education education             0
## default     default             0
## balance     balance             0
## housing     housing             0
## loan           loan             0
## contact     contact             0
## day             day             0
## month         month             0
## duration   duration             0
## campaign   campaign             0
## pdays         pdays             0
## previous   previous             0
## poutcome   poutcome             0
## deposit     deposit             0

3.2 HANDLING INCONSISTENT DATA

# Treat the literal string "unknown" as missing: every cell equal to it,
# in any column, is set to NA.
is.na(bank) <- bank == "unknown"

3.3 HANDLING MISSING VALUES

# Recount NAs per column now that "unknown" has been recoded to NA.
missing_after <- bank %>%
  is.na() %>%
  colSums()

# Same layout as the earlier table: column name plus its NA count.
missing_df_after <- data.frame(column = names(missing_after),
                               missing_count = missing_after)
missing_df_after
##              column missing_count
## age             age             0
## job             job            70
## marital     marital             0
## education education           497
## default     default             0
## balance     balance             0
## housing     housing             0
## loan           loan             0
## contact     contact          2346
## day             day             0
## month         month             0
## duration   duration             0
## campaign   campaign             0
## pdays         pdays             0
## previous   previous             0
## poutcome   poutcome          8326
## deposit     deposit             0
# Bar chart of the per-column NA counts held in missing_df_after.
ggplot(data = missing_df_after,
       mapping = aes(x = column, y = missing_count)) +
  geom_col(fill = "orange") +
  ggtitle("Missing Values After Handling 'unknown'") +
  xlab("Columns") +
  ylab("Missing Count") +
  theme_minimal() +
  # Tilt the column names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Keep only rows with no NA in any column (listwise deletion).
# NOTE(review): the na.action attribute printed by str(bank_transformed) below
# records 8487 omitted rows out of 11162, and poutcome alone has 8326 NAs.
# Confirm that discarding these rows is intended rather than keeping
# "unknown" as its own category for poutcome/contact.
bank_clean <- na.omit(bank)

3.4 OUTLIER DETECTION (IQR METHOD)

# IQR rule on balance: anything more than 1.5 * IQR outside the quartiles
# is flagged as an outlier.
Q1 <- quantile(bank_clean$balance, probs = 0.25)
Q3 <- quantile(bank_clean$balance, probs = 0.75)
IQR_value <- Q3 - Q1

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Rows whose balance is NOT inside [lower_bound, upper_bound].
outliers <- filter(
  bank_clean,
  !(balance >= lower_bound & balance <= upper_bound)
)
head(outliers, n = 10)
##    age         job marital education default balance housing loan  contact day
## 1   37  technician married secondary      no    5115     yes   no cellular  17
## 2   33  technician married  tertiary      no    6843      no   no cellular  20
## 3   50 blue-collar married   primary      no   12519     yes   no cellular  21
## 4   59  management married  tertiary      no    7049      no   no cellular  21
## 5   31  management married secondary      no    8629     yes   no cellular  21
## 6   31  management  single  tertiary      no   12857     yes   no cellular   6
## 7   40 blue-collar married secondary      no    5060      no   no cellular  10
## 8   30  management  single  tertiary      no    5561     yes   no cellular  27
## 9   30  management married  tertiary      no    8089     yes   no cellular  18
## 10  38  management  single  tertiary      no    6158     yes   no cellular  15
##    month duration campaign pdays previous poutcome deposit
## 1    nov     1210        2   171        4  failure     yes
## 2    nov      755        1   100       10    other     yes
## 3    nov      615        3    34        1  failure     yes
## 4    nov      530        1   163        2  failure     yes
## 5    nov      957        1   184        2  failure     yes
## 6    feb      158        1    92        1  success     yes
## 7    feb      154        2    93        1  success     yes
## 8    feb      195        1   100        1  success     yes
## 9    mar      232        2   294        2  failure     yes
## 10   apr      139        5   145        9  failure     yes
# Boxplot of balance for bank_clean, i.e. before outlier rows are removed.
bank_clean %>%
  ggplot(mapping = aes(y = balance)) +
  geom_boxplot(fill = "red") +
  ggtitle("Outliers in Balance (Before Filtering)") +
  theme_minimal()

# 3.5 FILTERING OUTLIERS

# Keep only the rows whose balance lies within the IQR bounds (inclusive).
bank_no_outliers <- filter(
  bank_clean,
  balance >= lower_bound,
  balance <= upper_bound
)
head(bank_no_outliers, n = 10)
##    age          job  marital education default balance housing loan   contact
## 1   42       admin.   single secondary      no    -247     yes  yes telephone
## 2   33     services  married secondary      no    3444     yes   no telephone
## 3   53      retired  married  tertiary      no    2269      no   no  cellular
## 4   45 entrepreneur  married secondary      no     781      no  yes  cellular
## 5   34   management   single  tertiary      no    1494     yes   no  cellular
## 6   46   management  married  tertiary      no       0      no   no  cellular
## 7   43   management  married  tertiary      no    1429     yes   no  cellular
## 8   33   technician   single  tertiary      no     149     yes   no  cellular
## 9   46   unemployed divorced secondary      no    3354     yes   no  cellular
## 10  38  blue-collar  married   primary      no     190     yes   no telephone
##    day month duration campaign pdays previous poutcome deposit
## 1   21   oct      519        1   166        1    other     yes
## 2   21   oct      144        1    91        4  failure     yes
## 3   17   nov     1091        2   150        1  success     yes
## 4   17   nov      652        2   126        2  failure     yes
## 5   18   nov      596        1   182        1    other     yes
## 6   18   nov      716        2   110        3    other     yes
## 7   19   nov     1015        1   198        2    other     yes
## 8   19   nov      424        2   182        1    other     yes
## 9   19   nov      522        1   174        1  success     yes
## 10  19   nov      623        1   175        1    other     yes
# Boxplot of balance for bank_no_outliers, i.e. after the IQR filter.
bank_no_outliers %>%
  ggplot(mapping = aes(y = balance)) +
  geom_boxplot(fill = "red") +
  ggtitle("Balance After Outlier Removal") +
  theme_minimal()

4. DATA TRANSFORMATION

4.1 QUALITATIVE TO QUANTITATIVE TRANSFORMATION

# Add numeric encodings of selected categorical columns next to the originals.

# education is ordinal, so its codes are pinned explicitly instead of being
# left to as.factor()'s alphabetical ordering of whichever labels are present.
education_levels <- c("primary", "secondary", "tertiary")
# Fail loudly on an unexpected label rather than silently encoding it as NA.
stopifnot(all(bank_no_outliers$education %in% education_levels))

bank_transformed <- bank_no_outliers %>%
  mutate(
    # yes/no columns become 1/0 indicators.
    housing_numeric = ifelse(housing == "yes", 1, 0),
    loan_numeric = ifelse(loan == "yes", 1, 0),
    deposit_numeric = ifelse(deposit == "yes", 1, 0),
    # job and marital are nominal: their codes follow the sorted order of the
    # labels present and are identifiers only, not a ranking.
    job_numeric = as.numeric(as.factor(job)),
    education_numeric = as.numeric(
      factor(education, levels = education_levels)
    ),
    marital_numeric = as.numeric(as.factor(marital))
  )

str(bank_transformed)
## 'data.frame':    2444 obs. of  23 variables:
##  $ age              : int  42 33 53 45 34 46 43 33 46 38 ...
##  $ job              : chr  "admin." "services" "retired" "entrepreneur" ...
##  $ marital          : chr  "single" "married" "married" "married" ...
##  $ education        : chr  "secondary" "secondary" "tertiary" "secondary" ...
##  $ default          : chr  "no" "no" "no" "no" ...
##  $ balance          : int  -247 3444 2269 781 1494 0 1429 149 3354 190 ...
##  $ housing          : chr  "yes" "yes" "no" "no" ...
##  $ loan             : chr  "yes" "no" "no" "yes" ...
##  $ contact          : chr  "telephone" "telephone" "cellular" "cellular" ...
##  $ day              : int  21 21 17 17 18 18 19 19 19 19 ...
##  $ month            : chr  "oct" "oct" "nov" "nov" ...
##  $ duration         : int  519 144 1091 652 596 716 1015 424 522 623 ...
##  $ campaign         : int  1 1 2 2 1 2 1 2 1 1 ...
##  $ pdays            : int  166 91 150 126 182 110 198 182 174 175 ...
##  $ previous         : int  1 4 1 2 1 3 2 1 1 1 ...
##  $ poutcome         : chr  "other" "failure" "success" "failure" ...
##  $ deposit          : chr  "yes" "yes" "yes" "yes" ...
##  $ housing_numeric  : num  1 1 0 0 1 0 1 1 1 1 ...
##  $ loan_numeric     : num  1 0 0 1 0 0 0 0 0 0 ...
##  $ deposit_numeric  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ job_numeric      : num  1 8 6 3 5 5 5 10 11 2 ...
##  $ education_numeric: num  2 2 3 2 3 3 3 3 2 1 ...
##  $ marital_numeric  : num  3 2 2 2 3 2 2 3 1 2 ...
##  - attr(*, "na.action")= 'omit' Named int [1:8487] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:8487] "1" "2" "3" "4" ...

4.2 VISUALIZATION OF QUALITATIVE → QUANTITATIVE TRANSFORMATION

Example 1: Deposit Before Transformation (Categorical)

# Row counts per original deposit label.
bank_no_outliers %>%
  ggplot(mapping = aes(x = deposit)) +
  geom_bar(fill = "purple") +
  ggtitle("Deposit (Categorical Representation)") +
  xlab("Deposit (Yes/No)") +
  ylab("Count") +
  theme_minimal()

# Example 2: Deposit After Transformation (Numeric)

# Row counts per encoded deposit value; factor() keeps the axis discrete.
bank_transformed %>%
  ggplot(mapping = aes(x = factor(deposit_numeric))) +
  geom_bar(fill = "orange") +
  ggtitle("Deposit After Numeric Encoding") +
  xlab("Deposit (0 = No, 1 = Yes)") +
  ylab("Count") +
  theme_minimal()

# Example 3: Job Before Transformation

# Row counts per original job label.
bank_no_outliers %>%
  ggplot(mapping = aes(x = job)) +
  geom_bar(fill = "darkgreen") +
  ggtitle("Job Categories (Original)") +
  xlab("Job") +
  ylab("Count") +
  theme_minimal() +
  # Tilt the job names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Example 4: Job After Numeric Encoding

# Row counts per encoded job value; factor() keeps the axis discrete.
bank_transformed %>%
  ggplot(mapping = aes(x = factor(job_numeric))) +
  geom_bar(fill = "skyblue") +
  ggtitle("Job Categories After Numeric Encoding") +
  xlab("Encoded Job Category") +
  ylab("Count") +
  theme_minimal()

5. EXPLORATORY DATA ANALYSIS (EDA)

5.1 Average Balance by Job

# Mean balance within each job category, one row per job.
avg_balance_job <- summarise(
  group_by(bank_transformed, job),
  mean_balance = mean(balance)
)
avg_balance_job
## # A tibble: 11 × 2
##    job           mean_balance
##    <chr>                <dbl>
##  1 admin.                953.
##  2 blue-collar           867.
##  3 entrepreneur         1154.
##  4 housemaid             853.
##  5 management           1064.
##  6 retired              1548.
##  7 self-employed        1082.
##  8 services              951.
##  9 student               976.
## 10 technician           1005.
## 11 unemployed            856.
# Bar chart of the precomputed mean balance per job.
ggplot(data = avg_balance_job,
       mapping = aes(x = job, y = mean_balance)) +
  geom_col(fill = "brown") +
  ggtitle("Average Balance by Job") +
  xlab("Job") +
  ylab("Average Balance") +
  theme_minimal() +
  # Tilt the job names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5.2 Balance vs Deposit

# Side-by-side boxplots of balance, one per deposit label.
bank_transformed %>%
  ggplot(mapping = aes(x = deposit, y = balance)) +
  geom_boxplot(fill = "cyan") +
  ggtitle("Balance Distribution by Deposit Status") +
  theme_minimal()