library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(dplyr)
library(ggthemes)
# Read the bank CSV into a data frame and preview its first rows.
# NOTE(review): absolute, machine-specific path; a project-relative path
# would make this reproducible on other machines.
data_frame <- read.csv(
  "C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv",
  header = TRUE,
  sep = ","
)
head(data_frame)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 <NA> no
## 2 may 151 1 -1 0 <NA> no
## 3 may 76 1 -1 0 <NA> no
## 4 may 92 1 -1 0 <NA> no
## 5 may 198 1 -1 0 <NA> no
## 6 may 139 1 -1 0 <NA> no
# Number of rows and columns in the raw data
dim(data_frame)
## [1] 45211 17
# Column names, storage types and the first few values of each column
str(data_frame)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" NA ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr NA NA NA NA ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr NA NA NA NA ...
## $ y : chr "no" "no" "no" "no" ...
# Per-column summary: quantiles and mean for numeric columns, length and
# class for character columns
summary(data_frame)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
I am creating the following columns and adding them to the data frame:
has_any_loan -> a logical OR of the ‘loan’ and ‘housing’ columns (TRUE if the client has a housing loan or a personal loan)
above_mean_balance -> TRUE if the balance is above the average balance
above_mean_duration -> TRUE if the duration is above the average duration
# Keep only rows where housing, loan and duration are all known, then derive
# three logical flag columns.
data_frame_copy <- data_frame |>
  filter(!is.na(housing), !is.na(loan), !is.na(duration)) |>
  mutate(
    # TRUE when the client has a housing loan or a personal loan
    has_any_loan = housing == "yes" | loan == "yes",
    # TRUE when the value exceeds the mean taken over the retained rows
    above_mean_balance = balance > mean(balance),
    above_mean_duration = duration > mean(duration)
  )
head(data_frame_copy)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y has_any_loan
## 1 may 261 1 -1 0 <NA> no TRUE
## 2 may 151 1 -1 0 <NA> no TRUE
## 3 may 76 1 -1 0 <NA> no TRUE
## 4 may 92 1 -1 0 <NA> no TRUE
## 5 may 198 1 -1 0 <NA> no FALSE
## 6 may 139 1 -1 0 <NA> no TRUE
## above_mean_balance above_mean_duration
## 1 TRUE TRUE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 TRUE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
# Snapshot the data before the yes/no and logical columns are recoded to 0/1;
# the ggplot box plots below read from these copies.
set1 <- set2 <- set3 <- data_frame_copy
balance: average yearly balance
has_any_loan: a logical OR of the ‘loan’ and ‘housing’ columns
default: has credit in default?
# Recode the analysis columns as 0/1 integers so they can be passed to cor():
#   - the derived logical flags become 1 (TRUE) / 0 (FALSE)
#   - the yes/no character columns become 1 ("yes") / 0 (anything else)
data_frame_copy <- data_frame_copy |>
  mutate(
    across(
      c(has_any_loan, above_mean_balance, above_mean_duration),
      as.integer
    ),
    across(
      c(default, housing, loan, y),
      function(answer) as.integer(answer == "yes")
    )
  )
head(data_frame_copy)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary 0 2143 1 0 <NA> 5
## 2 44 technician single secondary 0 29 1 0 <NA> 5
## 3 33 entrepreneur married secondary 0 2 1 1 <NA> 5
## 4 47 blue-collar married <NA> 0 1506 1 0 <NA> 5
## 5 33 <NA> single <NA> 0 1 0 0 <NA> 5
## 6 35 management married tertiary 0 231 1 0 <NA> 5
## month duration campaign pdays previous poutcome y has_any_loan
## 1 may 261 1 -1 0 <NA> 0 1
## 2 may 151 1 -1 0 <NA> 0 1
## 3 may 76 1 -1 0 <NA> 0 1
## 4 may 92 1 -1 0 <NA> 0 1
## 5 may 198 1 -1 0 <NA> 0 0
## 6 may 139 1 -1 0 <NA> 0 1
## above_mean_balance above_mean_duration
## 1 1 1
## 2 0 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 0 0
# Pearson correlation matrix for balance, the 0/1 default flag and has_any_loan
data_frame_copy |>
  select(balance, default, has_any_loan) |>
  cor(method = "pearson")
## balance default has_any_loan
## balance 1.00000000 -0.06674506 -0.09661643
## default -0.06674506 1.00000000 0.03360040
## has_any_loan -0.09661643 0.03360040 1.00000000
# Mosaic plot of the 0/1 default flag against has_any_loan
mosaicplot(
  default ~ has_any_loan,
  data = data_frame_copy,
  col = c("yellow", "lightpink")
)
# Box plot of balance for each default value, drawn from the set1 snapshot
# (where default is still "yes"/"no")
ggplot(data = set1) +
  geom_boxplot(aes(x = balance, y = default)) +
  ggtitle("default vs balance") +
  theme_minimal()
# 95% normal-approximation (Wald) confidence interval for the share of each
# `default` category (0 = "yes" not recorded, 1 = "yes").
n_obs <- length(data_frame_copy$default)
proportions <- table(data_frame_copy$default) / n_obs
# Standard error of each sample proportion
standard_errors <- sqrt(proportions * (1 - proportions) / n_obs)
z <- qnorm(0.975) # two-sided 95% critical value
lower_bound <- proportions - z * standard_errors
upper_bound <- proportions + z * standard_errors
# as.numeric() strips the table class: handing a table to data.frame()
# expands it into a Var1/Freq column pair instead of one numeric column.
# Bounds are clamped to [0, 1]; abs() would wrongly turn a negative lower
# bound into a positive one.
confidence_intervals_df <- data.frame(
  Category = names(proportions),
  Proportion = as.numeric(proportions),
  Lower_CI = pmax(as.numeric(lower_bound), 0),
  Upper_CI = pmin(as.numeric(upper_bound), 1)
)
print(confidence_intervals_df)
## Category Proportion Lower_CI Upper_CI
## 1 0 0.98197341 0.98074701 0.98319981
## 2 1 0.01802659 0.01680019 0.01925299
From the above graphs and correlation coefficients we can see that all pairwise correlations are very weak (|r| < 0.1), i.e. there is essentially no linear relationship between the variables.
In the ‘0’ category (i.e. the client's credit is NOT in default), the sample proportion is 0.982. We are 95% confident that the true population proportion of clients whose credit is not in default falls within the interval [0.981, 0.983].
In the ‘1’ category (i.e. the client's credit is in default), the sample proportion is 0.018. We are 95% confident that the true population proportion of clients whose credit is in default falls within the interval [0.017, 0.019].
balance: average yearly balance
above_mean_duration: yes if the duration is above the average duration
y: has the client subscribed a term deposit
# Pearson correlation matrix for balance, above_mean_duration and the 0/1
# subscription outcome y
data_frame_copy |>
  select(balance, above_mean_duration, y) |>
  cor(method = "pearson")
## balance above_mean_duration y
## balance 1.00000000 0.01567268 0.05283841
## above_mean_duration 0.01567268 1.00000000 0.30216394
## y 0.05283841 0.30216394 1.00000000
# Mosaic plot of above_mean_duration against the 0/1 outcome y
mosaicplot(
  above_mean_duration ~ y,
  data = data_frame_copy,
  col = c("lightblue", "lightpink")
)
# Box plot of balance for each value of y, drawn from the set2 snapshot
# (where y is still "yes"/"no")
ggplot(data = set2) +
  geom_boxplot(aes(x = y, y = balance)) +
  ggtitle("balance vs y") +
  theme_minimal()
# 95% normal-approximation (Wald) confidence interval for the share of each
# `y` category (1 = "yes", 0 = otherwise).
n_obs <- length(data_frame_copy$y)
proportions <- table(data_frame_copy$y) / n_obs
# Standard error of each sample proportion
standard_errors <- sqrt(proportions * (1 - proportions) / n_obs)
z <- qnorm(0.975) # two-sided 95% critical value
lower_bound <- proportions - z * standard_errors
upper_bound <- proportions + z * standard_errors
# as.numeric() strips the table class: handing a table to data.frame()
# expands it into a Var1/Freq column pair instead of one numeric column.
# Bounds are clamped to [0, 1]; abs() would wrongly turn a negative lower
# bound into a positive one.
confidence_intervals_df <- data.frame(
  Category = names(proportions),
  Proportion = as.numeric(proportions),
  Lower_CI = pmax(as.numeric(lower_bound), 0),
  Upper_CI = pmin(as.numeric(upper_bound), 1)
)
print(confidence_intervals_df)
## Category Proportion Lower_CI Upper_CI
## 1 0 0.8830152 0.8800526 0.8859778
## 2 1 0.1169848 0.1140222 0.1199474
From the above graphs and correlation coefficients we can see that there is a moderate positive correlation (r ≈ 0.30) between the variables ‘y’ and ‘above_mean_duration’, while ‘balance’ is only very weakly correlated with either of them.
In the ‘0’ category (i.e. the client has NOT subscribed to a term deposit), the sample proportion is 0.883. We are 95% confident that the true population proportion of clients who have not subscribed falls within the interval [0.880, 0.886].
In the ‘1’ category (i.e. the client has subscribed to a term deposit), the sample proportion is 0.117. We are 95% confident that the true population proportion of clients who have subscribed falls within the interval [0.114, 0.120].
housing: has housing loan?
loan: has personal loan?
has_any_loan: a logical OR of the ‘loan’ and ‘housing’ columns
# Pearson correlation matrix for has_any_loan and the two 0/1 loan flags it
# was derived from
data_frame_copy |>
  select(has_any_loan, loan, housing) |>
  cor(method = "pearson")
## has_any_loan loan housing
## has_any_loan 1.0000000 0.34234777 0.87676862
## loan 0.3423478 1.00000000 0.04132287
## housing 0.8767686 0.04132287 1.00000000
A mosaic plot is created based on the frequency of each category in the variables
# Mosaic plots of has_any_loan against each of its two source columns
mosaicplot(
  has_any_loan ~ loan,
  data = data_frame_copy,
  col = c("Blue", "Red")
)
mosaicplot(
  has_any_loan ~ housing,
  data = data_frame_copy,
  col = c("Blue", "Red")
)
From the above graphs and correlation coefficients we can see that there is a high correlation (r ≈ 0.88) between the variables ‘housing’ and ‘has_any_loan’. This is expected, because ‘has_any_loan’ is derived from ‘housing’ and ‘loan’.
# 95% normal-approximation (Wald) confidence interval for the share of each
# `has_any_loan` category (1 = has a housing or personal loan, 0 = neither).
n_obs <- length(data_frame_copy$has_any_loan)
proportions <- table(data_frame_copy$has_any_loan) / n_obs
# Standard error of each sample proportion
standard_errors <- sqrt(proportions * (1 - proportions) / n_obs)
z <- qnorm(0.975) # two-sided 95% critical value
lower_bound <- proportions - z * standard_errors
upper_bound <- proportions + z * standard_errors
# as.numeric() strips the table class: handing a table to data.frame()
# expands it into a Var1/Freq column pair instead of one numeric column.
# Bounds are clamped to [0, 1]; abs() would wrongly turn a negative lower
# bound into a positive one.
confidence_intervals_df <- data.frame(
  Category = names(proportions),
  Proportion = as.numeric(proportions),
  Lower_CI = pmax(as.numeric(lower_bound), 0),
  Upper_CI = pmin(as.numeric(upper_bound), 1)
)
print(confidence_intervals_df)
## Category Proportion Lower_CI Upper_CI
## 1 0 0.3805269 0.3760515 0.3850022
## 2 1 0.6194731 0.6149978 0.6239485
The intervals give you a range within which you can be reasonably confident that the true population proportion lies.
In the ‘0’ category (i.e. the client has neither loan), the sample proportion is 0.381. We are 95% confident that the true population proportion of clients having neither loan falls within the interval [0.376, 0.385].
In the ‘1’ category (i.e. the client has at least one of the loans), the sample proportion is 0.619. We are 95% confident that the true population proportion of clients having at least one loan falls within the interval [0.615, 0.624].