library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load bank.csv into `bank`.
# The location defaults to the original author's local path. Set the BANK_CSV
# environment variable to read the file from somewhere else without editing
# this script; an unset or empty variable falls back to the default.
bank_path <- Sys.getenv("BANK_CSV")
if (!nzchar(bank_path)) {
  bank_path <- "/Users/regishamanandhar/Documents/DCS 402/bank.csv"
}
# stringsAsFactors = FALSE is the default since R 4.0; it is spelled out so
# that text columns are read as character regardless of the R version.
bank <- read.csv(bank_path, stringsAsFactors = FALSE)
# Compact overview: dimensions, column types and the first few values.
str(bank)
## 'data.frame': 11162 obs. of 17 variables:
## $ age : int 59 56 41 55 54 42 56 60 37 28 ...
## $ job : chr "admin." "admin." "technician" "services" ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education: chr "secondary" "secondary" "secondary" "secondary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2343 45 1270 2476 184 0 830 545 1 5090 ...
## $ housing : chr "yes" "no" "yes" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 6 6 6 6 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 1042 1467 1389 579 673 562 1201 1030 608 1297 ...
## $ campaign : int 1 1 1 1 2 2 1 1 1 3 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ deposit : chr "yes" "yes" "yes" "yes" ...
# Per-column summary of `bank` (printed at top level).
summary(bank)
## age job marital education
## Min. :18.00 Length:11162 Length:11162 Length:11162
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :41.23
## 3rd Qu.:49.00
## Max. :95.00
## default balance housing loan
## Length:11162 Min. :-6847 Length:11162 Length:11162
## Class :character 1st Qu.: 122 Class :character Class :character
## Mode :character Median : 550 Mode :character Mode :character
## Mean : 1529
## 3rd Qu.: 1708
## Max. :81204
## contact day month duration
## Length:11162 Min. : 1.00 Length:11162 Min. : 2
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 138
## Mode :character Median :15.00 Mode :character Median : 255
## Mean :15.66 Mean : 372
## 3rd Qu.:22.00 3rd Qu.: 496
## Max. :31.00 Max. :3881
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Length:11162
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.00 Median : 0.0000 Mode :character
## Mean : 2.508 Mean : 51.33 Mean : 0.8326
## 3rd Qu.: 3.000 3rd Qu.: 20.75 3rd Qu.: 1.0000
## Max. :63.000 Max. :854.00 Max. :58.0000
## deposit
## Length:11162
## Class :character
## Mode :character
##
##
##
# Count the NA entries in every column of `bank` (one named total per column)
# and put the totals into a two-column table for display.
missing_values <- vapply(bank, function(col) sum(is.na(col)), numeric(1))
missing_df <- data.frame(
  column = names(missing_values),
  missing_count = missing_values
)
print(missing_df)
## column missing_count
## age age 0
## job job 0
## marital marital 0
## education education 0
## default default 0
## balance balance 0
## housing housing 0
## loan loan 0
## contact contact 0
## day day 0
## month month 0
## duration duration 0
## campaign campaign 0
## pdays pdays 0
## previous previous 0
## poutcome poutcome 0
## deposit deposit 0
# Recode the literal string "unknown" as NA everywhere in `bank`, then repeat
# the per-column NA count so the effect of the recode is visible.
bank[bank == "unknown"] <- NA
missing_after <- vapply(bank, function(col) sum(is.na(col)), numeric(1))
missing_df_after <- data.frame(
  column = names(missing_after),
  missing_count = missing_after
)
print(missing_df_after)
## column missing_count
## age age 0
## job job 70
## marital marital 0
## education education 497
## default default 0
## balance balance 0
## housing housing 0
## loan loan 0
## contact contact 2346
## day day 0
## month month 0
## duration duration 0
## campaign campaign 0
## pdays pdays 0
## previous previous 0
## poutcome poutcome 8326
## deposit deposit 0
# Bar chart of the NA count per column held in `missing_df_after`.
# Bar heights are taken directly from `missing_count`; x labels are rotated
# 45 degrees so the column names do not overlap.
ggplot(data = missing_df_after, mapping = aes(x = column, y = missing_count)) +
  geom_col(fill = "orange") +
  labs(
    title = "Missing Values After Handling 'unknown'",
    x = "Columns",
    y = "Missing Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Keep only rows with no NA in any column.
# NOTE(review): this drops a row if *any* column is NA. Per the recorded
# output, `poutcome` alone has 8326 NAs and 8487 of 11162 rows are omitted
# here -- confirm that losing most of the data is intended.
bank_clean <- na.omit(bank)

# 1.5 * IQR fences for `balance`: anything below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR is treated as an outlier.
Q1 <- quantile(bank_clean[["balance"]], probs = 0.25)
Q3 <- quantile(bank_clean[["balance"]], probs = 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Rows whose balance falls outside the fences, and a preview of the first 10.
outliers <- filter(
  bank_clean,
  !(balance >= lower_bound & balance <= upper_bound)
)
head(outliers, n = 10)
## age job marital education default balance housing loan contact day
## 1 37 technician married secondary no 5115 yes no cellular 17
## 2 33 technician married tertiary no 6843 no no cellular 20
## 3 50 blue-collar married primary no 12519 yes no cellular 21
## 4 59 management married tertiary no 7049 no no cellular 21
## 5 31 management married secondary no 8629 yes no cellular 21
## 6 31 management single tertiary no 12857 yes no cellular 6
## 7 40 blue-collar married secondary no 5060 no no cellular 10
## 8 30 management single tertiary no 5561 yes no cellular 27
## 9 30 management married tertiary no 8089 yes no cellular 18
## 10 38 management single tertiary no 6158 yes no cellular 15
## month duration campaign pdays previous poutcome deposit
## 1 nov 1210 2 171 4 failure yes
## 2 nov 755 1 100 10 other yes
## 3 nov 615 3 34 1 failure yes
## 4 nov 530 1 163 2 failure yes
## 5 nov 957 1 184 2 failure yes
## 6 feb 158 1 92 1 success yes
## 7 feb 154 2 93 1 success yes
## 8 feb 195 1 100 1 success yes
## 9 mar 232 2 294 2 failure yes
## 10 apr 139 5 145 9 failure yes
# Box plot of `balance` in `bank_clean`, i.e. before the outlier rows are
# removed.
ggplot(data = bank_clean) +
  geom_boxplot(mapping = aes(y = balance), fill = "red") +
  labs(title = "Outliers in Balance (Before Filtering)") +
  theme_minimal()
# 3.5 FILTERING OUTLIERS
# Keep the rows of `bank_clean` whose balance lies inside the fences
# (both bounds inclusive), then preview the first 10 remaining rows.
bank_no_outliers <- filter(
  bank_clean,
  balance >= lower_bound,
  balance <= upper_bound
)
head(bank_no_outliers, n = 10)
## age job marital education default balance housing loan contact
## 1 42 admin. single secondary no -247 yes yes telephone
## 2 33 services married secondary no 3444 yes no telephone
## 3 53 retired married tertiary no 2269 no no cellular
## 4 45 entrepreneur married secondary no 781 no yes cellular
## 5 34 management single tertiary no 1494 yes no cellular
## 6 46 management married tertiary no 0 no no cellular
## 7 43 management married tertiary no 1429 yes no cellular
## 8 33 technician single tertiary no 149 yes no cellular
## 9 46 unemployed divorced secondary no 3354 yes no cellular
## 10 38 blue-collar married primary no 190 yes no telephone
## day month duration campaign pdays previous poutcome deposit
## 1 21 oct 519 1 166 1 other yes
## 2 21 oct 144 1 91 4 failure yes
## 3 17 nov 1091 2 150 1 success yes
## 4 17 nov 652 2 126 2 failure yes
## 5 18 nov 596 1 182 1 other yes
## 6 18 nov 716 2 110 3 other yes
## 7 19 nov 1015 1 198 2 other yes
## 8 19 nov 424 2 182 1 other yes
## 9 19 nov 522 1 174 1 success yes
## 10 19 nov 623 1 175 1 other yes
# Box plot of `balance` in `bank_no_outliers`, i.e. after the rows outside
# the IQR fences have been dropped.
ggplot(data = bank_no_outliers) +
  geom_boxplot(mapping = aes(y = balance), fill = "red") +
  labs(title = "Balance After Outlier Removal") +
  theme_minimal()
# Add numeric versions of selected categorical columns; the originals are
# kept alongside them.
#   * housing / loan / deposit: 1 when the value is "yes", 0 otherwise.
#   * job / education / marital: the integer code of the value's factor
#     level, stored as numeric.
bank_transformed <- mutate(
  bank_no_outliers,
  housing_numeric = as.numeric(housing == "yes"),
  loan_numeric = as.numeric(loan == "yes"),
  deposit_numeric = as.numeric(deposit == "yes"),
  job_numeric = as.numeric(factor(job)),
  education_numeric = as.numeric(factor(education)),
  marital_numeric = as.numeric(factor(marital))
)
# Show the structure of the result, including the newly added columns.
str(bank_transformed)
## 'data.frame': 2444 obs. of 23 variables:
## $ age : int 42 33 53 45 34 46 43 33 46 38 ...
## $ job : chr "admin." "services" "retired" "entrepreneur" ...
## $ marital : chr "single" "married" "married" "married" ...
## $ education : chr "secondary" "secondary" "tertiary" "secondary" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int -247 3444 2269 781 1494 0 1429 149 3354 190 ...
## $ housing : chr "yes" "yes" "no" "no" ...
## $ loan : chr "yes" "no" "no" "yes" ...
## $ contact : chr "telephone" "telephone" "cellular" "cellular" ...
## $ day : int 21 21 17 17 18 18 19 19 19 19 ...
## $ month : chr "oct" "oct" "nov" "nov" ...
## $ duration : int 519 144 1091 652 596 716 1015 424 522 623 ...
## $ campaign : int 1 1 2 2 1 2 1 2 1 1 ...
## $ pdays : int 166 91 150 126 182 110 198 182 174 175 ...
## $ previous : int 1 4 1 2 1 3 2 1 1 1 ...
## $ poutcome : chr "other" "failure" "success" "failure" ...
## $ deposit : chr "yes" "yes" "yes" "yes" ...
## $ housing_numeric : num 1 1 0 0 1 0 1 1 1 1 ...
## $ loan_numeric : num 1 0 0 1 0 0 0 0 0 0 ...
## $ deposit_numeric : num 1 1 1 1 1 1 1 1 1 1 ...
## $ job_numeric : num 1 8 6 3 5 5 5 10 11 2 ...
## $ education_numeric: num 2 2 3 2 3 3 3 3 2 1 ...
## $ marital_numeric : num 3 2 2 2 3 2 2 3 1 2 ...
## - attr(*, "na.action")= 'omit' Named int [1:8487] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "names")= chr [1:8487] "1" "2" "3" "4" ...
# Example 1: Deposit Before Transformation (Categorical)
# Row counts per value of the original `deposit` column.
ggplot(data = bank_no_outliers) +
  geom_bar(mapping = aes(x = deposit), fill = "purple") +
  labs(
    title = "Deposit (Categorical Representation)",
    x = "Deposit (Yes/No)",
    y = "Count"
  ) +
  theme_minimal()
# Example 2: Deposit After Transformation (Numeric)
# Row counts per encoded value; `deposit_numeric` is wrapped in factor() so
# the x axis is discrete rather than continuous.
ggplot(data = bank_transformed) +
  geom_bar(mapping = aes(x = factor(deposit_numeric)), fill = "orange") +
  labs(
    title = "Deposit After Numeric Encoding",
    x = "Deposit (0 = No, 1 = Yes)",
    y = "Count"
  ) +
  theme_minimal()
# Example 3: Job Before Transformation
# Row counts per original `job` label; x labels are rotated 45 degrees so the
# job names stay readable.
ggplot(data = bank_no_outliers) +
  geom_bar(mapping = aes(x = job), fill = "darkgreen") +
  labs(
    title = "Job Categories (Original)",
    x = "Job",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Example 4: Job After Numeric Encoding
# Row counts per encoded job code; factor() keeps the codes as discrete
# categories on the x axis.
ggplot(data = bank_transformed) +
  geom_bar(mapping = aes(x = factor(job_numeric)), fill = "skyblue") +
  labs(
    title = "Job Categories After Numeric Encoding",
    x = "Encoded Job Category",
    y = "Count"
  ) +
  theme_minimal()
# Mean `balance` for each `job` value in `bank_transformed`
# (one output row per job).
avg_balance_job <- summarise(
  group_by(bank_transformed, job),
  mean_balance = mean(balance)
)
print(avg_balance_job)
## # A tibble: 11 × 2
## job mean_balance
## <chr> <dbl>
## 1 admin. 953.
## 2 blue-collar 867.
## 3 entrepreneur 1154.
## 4 housemaid 853.
## 5 management 1064.
## 6 retired 1548.
## 7 self-employed 1082.
## 8 services 951.
## 9 student 976.
## 10 technician 1005.
## 11 unemployed 856.
# Bar chart of the mean balance per job from `avg_balance_job`.
# Bar heights are the precomputed `mean_balance` values; x labels are rotated
# 45 degrees for readability.
ggplot(data = avg_balance_job, mapping = aes(x = job, y = mean_balance)) +
  geom_col(fill = "brown") +
  labs(
    title = "Average Balance by Job",
    x = "Job",
    y = "Average Balance"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Side-by-side box plots of `balance`, one box per `deposit` value.
ggplot(data = bank_transformed) +
  geom_boxplot(mapping = aes(x = deposit, y = balance), fill = "cyan") +
  labs(title = "Balance Distribution by Deposit Status") +
  theme_minimal()