# Reporting the directory the script is being run from
print(getwd())
## [1] "/Users/ursulapodosenin/Desktop"
# Loading the libraries I will use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(corrplot)
## corrplot 0.95 loaded
# Installing DataExplorer only when it is not already available, so that
# re-running the script does not download the package again every time.
# The mirror is addressed over HTTPS rather than plain HTTP.
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer", repos = "https://cloud.r-project.org")
}
library(DataExplorer)
# Reading the semicolon-separated bank-full.csv file into a data frame
data <- read.csv(
  file = "/Users/ursulapodosenin/Desktop/bank-full.csv",
  sep = ";"
)
# Previewing the first six rows
head(data, n = 6L)
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no
# Counting the NA entries in each column
missing_values <- data %>%
  is.na() %>%
  colSums()
missing_values
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0
# Drawing a histogram for each numeric variable
DataExplorer::plot_histogram(data)

# Getting the pairwise correlations between the numeric columns.
# select(where(...)) is used because dplyr has superseded select_if().
numeric_data <- select(data, where(is.numeric))
# "complete.obs" restricts the computation to rows with no missing values
cor_matrix <- cor(numeric_data, use = "complete.obs")
cor_matrix
##                   age      balance          day     duration     campaign
## age       1.000000000  0.097782739 -0.009120046 -0.004648428  0.004760312
## balance   0.097782739  1.000000000  0.004502585  0.021560380 -0.014578279
## day      -0.009120046  0.004502585  1.000000000 -0.030206341  0.162490216
## duration -0.004648428  0.021560380 -0.030206341  1.000000000 -0.084569503
## campaign  0.004760312 -0.014578279  0.162490216 -0.084569503  1.000000000
## pdays    -0.023758014  0.003435322 -0.093044074 -0.001564770 -0.088627668
## previous  0.001288319  0.016673637 -0.051710497  0.001203057 -0.032855290
##                 pdays     previous
## age      -0.023758014  0.001288319
## balance   0.003435322  0.016673637
## day      -0.093044074 -0.051710497
## duration -0.001564770  0.001203057
## campaign -0.088627668 -0.032855290
## pdays     1.000000000  0.454819635
## previous  0.454819635  1.000000000
# Visualising the correlation matrix as a grid of coloured cells
corrplot::corrplot(
  corr = cor_matrix,
  method = "color",
  tl.cex = 0.7
)

# Plotting the frequency distribution of each categorical variable
DataExplorer::plot_bar(data)

# Summarising each numeric column
numeric_data %>%
  summary()
##       age           balance            day           duration     
##  Min.   :18.00   Min.   : -8019   Min.   : 1.00   Min.   :   0.0  
##  1st Qu.:33.00   1st Qu.:    72   1st Qu.: 8.00   1st Qu.: 103.0  
##  Median :39.00   Median :   448   Median :16.00   Median : 180.0  
##  Mean   :40.94   Mean   :  1362   Mean   :15.81   Mean   : 258.2  
##  3rd Qu.:48.00   3rd Qu.:  1428   3rd Qu.:21.00   3rd Qu.: 319.0  
##  Max.   :95.00   Max.   :102127   Max.   :31.00   Max.   :4918.0  
##     campaign          pdays          previous       
##  Min.   : 1.000   Min.   : -1.0   Min.   :  0.0000  
##  1st Qu.: 1.000   1st Qu.: -1.0   1st Qu.:  0.0000  
##  Median : 2.000   Median : -1.0   Median :  0.0000  
##  Mean   : 2.764   Mean   : 40.2   Mean   :  0.5803  
##  3rd Qu.: 3.000   3rd Qu.: -1.0   3rd Qu.:  0.0000  
##  Max.   :63.000   Max.   :871.0   Max.   :275.0000
# Detecting outliers using IQR method (borrowed from ChatGPT)
# Count the values of `x` that lie outside the IQR fences.
#
# Arguments:
#   x           vector of values to check; NA entries are ignored.
#   multiplier  width of the fences in IQR units. Defaults to 1.5, which
#               reproduces the original hard-coded rule.
#
# Returns a single integer: the number of non-missing values strictly below
# Q1 - multiplier * IQR or strictly above Q3 + multiplier * IQR.
outlier_detection <- function(x, multiplier = 1.5) {
  stopifnot(
    is.numeric(multiplier),
    length(multiplier) == 1L,
    !is.na(multiplier),
    multiplier >= 0
  )

  # Both quartiles come from one quantile() call; names are dropped because
  # only the values are needed for the arithmetic below.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- multiplier * (quartiles[2] - quartiles[1])
  lower_bound <- quartiles[1] - fence_width
  upper_bound <- quartiles[2] + fence_width

  # na.rm = TRUE so that missing values are never counted as outliers
  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}

# Counting the IQR outliers in every numeric column. vapply() is used instead
# of sapply() so the result is always an integer vector, even if
# numeric_data has no columns.
outliers <- vapply(numeric_data, outlier_detection, integer(1))
outliers
##      age  balance      day duration campaign    pdays previous 
##      487     4729        0     3235     3064     8257     8257
# Raising the scientific-notation penalty so large values print in full,
# then drawing one box per numeric variable with perpendicular axis labels
options(scipen = 10)
boxplot(
  numeric_data,
  las = 2,
  main = "Boxplot of Numeric Variables"
)

# Scatter plot of balance against age, with points coloured by job
data %>%
  ggplot(mapping = aes(age, balance, colour = job)) +
  geom_point(alpha = 0.5) +
  theme_minimal()

# Box plots of balance for each job type, with slanted x-axis labels
data %>%
  ggplot(mapping = aes(job, balance)) +
  geom_boxplot(alpha = 0.7, fill = "skyblue") +
  labs(x = "Job Type", y = "Balance", title = "Balance by Job Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))