# Printing the current working directory
getwd()
## [1] "/Users/ursulapodosenin/Desktop"
# Loading the libraries I will use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
# Installing DataExplorer only when it is not already available, so the
# package is not downloaded and reinstalled on every run of this script
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer", repos = "http://cran.us.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/s4/qp5wvr717qj094rlqhb65ygc0000gn/T//RtmpWQ00Lj/downloaded_packages
library(DataExplorer)
# Reading the semicolon-separated bank-full.csv file into a data frame
# and previewing its first rows
data <- read.csv(
  file = "/Users/ursulapodosenin/Desktop/bank-full.csv",
  sep = ";"
)
head(data)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
# Counting the NA entries in each column
missing_values <- data %>%
  is.na() %>%
  colSums()
missing_values
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Checking the distributions of the numerical variables with histograms
plot_histogram(data)

# Getting the pairwise correlations between the numeric columns.
# select(where(...)) replaces the superseded select_if() helper.
numeric_data <- select(data, where(is.numeric))
# use = "complete.obs" drops any row containing an NA before correlating
cor_matrix <- cor(numeric_data, use = "complete.obs")
cor_matrix
## age balance day duration campaign
## age 1.000000000 0.097782739 -0.009120046 -0.004648428 0.004760312
## balance 0.097782739 1.000000000 0.004502585 0.021560380 -0.014578279
## day -0.009120046 0.004502585 1.000000000 -0.030206341 0.162490216
## duration -0.004648428 0.021560380 -0.030206341 1.000000000 -0.084569503
## campaign 0.004760312 -0.014578279 0.162490216 -0.084569503 1.000000000
## pdays -0.023758014 0.003435322 -0.093044074 -0.001564770 -0.088627668
## previous 0.001288319 0.016673637 -0.051710497 0.001203057 -0.032855290
## pdays previous
## age -0.023758014 0.001288319
## balance 0.003435322 0.016673637
## day -0.093044074 -0.051710497
## duration -0.001564770 0.001203057
## campaign -0.088627668 -0.032855290
## pdays 1.000000000 0.454819635
## previous 0.454819635 1.000000000
# Drawing the correlation matrix as a color-coded grid with smaller labels
corrplot(
  corr = cor_matrix,
  method = "color",
  tl.cex = 0.7
)

# Plotting the distributions of the categorical variables
plot_bar(data)


# Getting summary statistics for each column of the numeric data
summary(numeric_data)
## age balance day duration
## Min. :18.00 Min. : -8019 Min. : 1.00 Min. : 0.0
## 1st Qu.:33.00 1st Qu.: 72 1st Qu.: 8.00 1st Qu.: 103.0
## Median :39.00 Median : 448 Median :16.00 Median : 180.0
## Mean :40.94 Mean : 1362 Mean :15.81 Mean : 258.2
## 3rd Qu.:48.00 3rd Qu.: 1428 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :95.00 Max. :102127 Max. :31.00 Max. :4918.0
## campaign pdays previous
## Min. : 1.000 Min. : -1.0 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000
## Median : 2.000 Median : -1.0 Median : 0.0000
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
# Detecting outliers using IQR method (borrowed from ChatGPT)
#
# Counts the values of `x` that fall outside the fences
# [Q1 - multiplier * IQR, Q3 + multiplier * IQR].
#
# Arguments:
#   x          Numeric vector; NA values are ignored.
#   multiplier Width of the fences in IQR units. Defaults to 1.5, the value
#              that was previously hard-coded, so existing calls are unchanged.
#
# Returns a single integer: the number of non-NA values beyond either fence.
outlier_detection <- function(x, multiplier = 1.5) {
  # Both quartiles in one call; names = FALSE drops the "25%"/"75%" labels
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - multiplier * iqr
  upper_bound <- quartiles[2] + multiplier * iqr
  # Comparisons against NA yield NA, which na.rm = TRUE leaves out of the count
  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}
# Counting the IQR outliers in every numeric column. vapply() pins the result
# to one integer per column, so the output type cannot silently change the
# way it can with sapply().
outliers <- vapply(numeric_data, outlier_detection, integer(1))
outliers
## age balance day duration campaign pdays previous
## 487 4729 0 3235 3064 8257 8257
# Setting scipen so the plot avoids scientific notation, then drawing a
# boxplot for every numeric column
options(scipen = 10)
boxplot(
  numeric_data,
  main = "Boxplot of Numeric Variables",
  las = 2
)

# Plotting balance against age, with points colored by job
ggplot(data = data, mapping = aes(x = age, y = balance, color = job)) +
  geom_point(alpha = 0.5) +
  theme_minimal()

# Comparing the balance distribution across job types with boxplots
ggplot(data = data, mapping = aes(x = job, y = balance)) +
  geom_boxplot(fill = "skyblue", alpha = 0.7) +
  labs(title = "Balance by Job Type", x = "Job Type", y = "Balance") +
  theme_minimal() +
  # Rotating the x-axis labels by 45 degrees
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
