library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#path to the file = 'C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv'
data_frame = read.csv('C:/Users/prera/OneDrive/Desktop/INFO-I590/bank-full2.csv',header=TRUE, sep = ",")
head(data_frame)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 <NA> no
## 2 may 151 1 -1 0 <NA> no
## 3 may 76 1 -1 0 <NA> no
## 4 may 92 1 -1 0 <NA> no
## 5 may 198 1 -1 0 <NA> no
## 6 may 139 1 -1 0 <NA> no
dim(data_frame)
## [1] 45211 17
str(data_frame)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" NA ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr NA NA NA NA ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr NA NA NA NA ...
## $ y : chr "no" "no" "no" "no" ...
For the first grouping, I want to understand how the different levels of education are affecting the balance of the client.
data_frame_education <- data_frame|>
filter(!(is.na(education)))|>
group_by(education) |>
summarise(category_mean_balance = mean(balance,na.rm=TRUE), size=n())
head(data_frame_education)
## # A tibble: 3 × 3
## education category_mean_balance size
## <chr> <dbl> <int>
## 1 primary 1251. 6851
## 2 secondary 1155. 23202
## 3 tertiary 1758. 13301
# Create a new column 'category_mean_balance' based on the condition
data_frame_education_balance <- data_frame |>
left_join(data_frame_education, by = "education")
data_frame_education_balance <- data_frame_education_balance[,-19]
mean(data_frame$balance)
## [1] 1362.272
data_frame_education_balance <- data_frame_education_balance |>
mutate( below_mean = ifelse(balance < category_mean_balance, "yes", "No"))
head(data_frame_education_balance)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y category_mean_balance
## 1 may 261 1 -1 0 <NA> no 1758.416
## 2 may 151 1 -1 0 <NA> no 1154.881
## 3 may 76 1 -1 0 <NA> no 1154.881
## 4 may 92 1 -1 0 <NA> no NA
## 5 may 198 1 -1 0 <NA> no NA
## 6 may 139 1 -1 0 <NA> no 1758.416
## below_mean
## 1 No
## 2 yes
## 3 yes
## 4 <NA>
## 5 <NA>
## 6 yes
data_frame_education <- data_frame_education_balance|>
filter(!(is.na(education)))|>
group_by(education,below_mean) |>
summarise(category_mean_balance = mean(balance,na.rm=TRUE), size=n())
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
data_frame_education
## # A tibble: 6 × 4
## # Groups: education [3]
## education below_mean category_mean_balance size
## <chr> <chr> <dbl> <int>
## 1 primary No 3880. 1856
## 2 primary yes 274. 4995
## 3 secondary No 3664. 6109
## 4 secondary yes 258. 17093
## 5 tertiary No 5612. 3406
## 6 tertiary yes 432. 9895
It is interesting to see that, although there are thrice as many people with secondary education when compared to the number of people with primary education the average balance of the group with primary education is higher than that of the secondary education.
The graph shows the the average balance for each education group
p <- data_frame_education |>
ggplot(aes(x = education, y=category_mean_balance, fill =below_mean ) )+
geom_bar(position = "dodge", stat = "identity") +
theme_minimal() +
scale_fill_brewer(palette = 'Pastel1')
p
The number of people in each category.
p <- data_frame_education |>
ggplot(aes(x = education, y=size, fill =below_mean ) )+
geom_bar(position = "dodge", stat = "identity") +
theme_minimal() +
scale_fill_brewer(palette = 'Set3')
p
total <- 1856+4995+6109+17093+3406+9895
primary_education_below_average <- 4995/total
primary_education_below_average
## [1] 0.1152143
primary_education_above_average <- 1856/total
primary_education_above_average
## [1] 0.04281035
secondary_education_below_average <- 17093/total
secondary_education_below_average
## [1] 0.3942658
secondary_education_above_average <- 6109/total
secondary_education_above_average
## [1] 0.1409097
tertiary_education_below_average <- 9895/total
tertiary_education_below_average
## [1] 0.2282373
tertiary_education_above_average <- 3406/total
tertiary_education_above_average
## [1] 0.07856253
# Pie Chart with Percentages
slices <- c(primary_education_below_average,primary_education_above_average, secondary_education_below_average, secondary_education_above_average, tertiary_education_below_average, tertiary_education_above_average)
lbls <- c("primary_below_average","primary_above_average", "secondary_below_average", "secondary_above_average", "tertiary_below_average", "tertiary_above_average")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)
# add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, main="Pie Chart")
data_frame_education_balance <- data_frame_education_balance %>%
mutate(Anomaly = ifelse(education == 'primary' & below_mean=='No', "Anomaly", "Not an anomaly"))
head(data_frame_education_balance)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y category_mean_balance
## 1 may 261 1 -1 0 <NA> no 1758.416
## 2 may 151 1 -1 0 <NA> no 1154.881
## 3 may 76 1 -1 0 <NA> no 1154.881
## 4 may 92 1 -1 0 <NA> no NA
## 5 may 198 1 -1 0 <NA> no NA
## 6 may 139 1 -1 0 <NA> no 1758.416
## below_mean Anomaly
## 1 No Not an anomaly
## 2 yes Not an anomaly
## 3 yes Not an anomaly
## 4 <NA> <NA>
## 5 <NA> <NA>
## 6 yes Not an anomaly
Hypothesis - people with higher education will have a greater balance
Answer to the hypothesis - The tertiary education group has the highest average balance.
housing: has housing loan?
loan: has personal loan?
Hypothesis - people with a housing and personal loan will have a lower balance
For the second grouping, I want to understand how the balance of the
client having a housing and personal loan are related.
data_frame_loan <- data_frame|>
filter(!(is.na(housing)&is.na(loan)))|>
group_by(housing,loan) |>
summarise(category_mean_balance = mean(balance,na.rm=TRUE), category_max_balance = max(balance,na.rm=TRUE), category_min_balance = min(balance,na.rm=TRUE), size=n())
## `summarise()` has grouped output by 'housing'. You can override using the
## `.groups` argument.
data_frame_loan
## # A tibble: 4 × 6
## # Groups: housing [2]
## housing loan category_mean_balance category_max_balance category_min_balance
## <chr> <chr> <dbl> <int> <int>
## 1 no no 1738. 102127 -1968
## 2 no yes 753. 51439 -8019
## 3 yes no 1256. 58544 -4057
## 4 yes yes 789. 34230 -3313
## # ℹ 1 more variable: size <int>
data_frame_loan$housing_personal <- paste(data_frame_loan$housing, "-", data_frame_loan$loan)
head(data_frame_loan)
## # A tibble: 4 × 7
## # Groups: housing [2]
## housing loan category_mean_balance category_max_balance category_min_balance
## <chr> <chr> <dbl> <int> <int>
## 1 no no 1738. 102127 -1968
## 2 no yes 753. 51439 -8019
## 3 yes no 1256. 58544 -4057
## 4 yes yes 789. 34230 -3313
## # ℹ 2 more variables: size <int>, housing_personal <chr>
p <- data_frame_loan |>
ggplot(aes(x = housing_personal, y=category_mean_balance) )+
geom_bar(fill="lightblue",position = "dodge", stat = "identity") +
theme_minimal() +
scale_fill_brewer(palette = 'Set3')
p
p <- data_frame_loan |>
ggplot(aes(x = housing_personal, y=category_max_balance) )+
geom_bar(fill="lightgreen",position = "dodge", stat = "identity") +
theme_minimal() +
scale_fill_brewer(palette = 'Set3')
p
p <- data_frame_loan |>
ggplot(aes(x = housing_personal, y=category_min_balance) )+
geom_bar(fill="yellow",position = "dodge", stat = "identity") +
theme_minimal() +
scale_fill_brewer(palette = 'Set3')
p
total <- 17204+2877+20763+4367
no_loans<- 17204/total
no_loans
## [1] 0.3805269
only_personal <- 2877/total
only_personal
## [1] 0.06363496
only_housing <- 20763/total
only_housing
## [1] 0.4592466
both_loans <- 4367/total
both_loans
## [1] 0.09659154
# Pie Chart with Percentages
slices <- c(no_loans,only_personal, only_housing, both_loans)
lbls <- c("no_loans","only_personal", "only_housing", "both_loans")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)
# add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, main="Pie Chart")
data_frame_housing_personal <- data_frame_education_balance |>
mutate( Anomaly = ifelse((housing == "no" & loan == "yes" ), "Anomaly", "Not an Anomaly"))
head(data_frame_housing_personal)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no <NA> 5
## 2 44 technician single secondary no 29 yes no <NA> 5
## 3 33 entrepreneur married secondary no 2 yes yes <NA> 5
## 4 47 blue-collar married <NA> no 1506 yes no <NA> 5
## 5 33 <NA> single <NA> no 1 no no <NA> 5
## 6 35 management married tertiary no 231 yes no <NA> 5
## month duration campaign pdays previous poutcome y category_mean_balance
## 1 may 261 1 -1 0 <NA> no 1758.416
## 2 may 151 1 -1 0 <NA> no 1154.881
## 3 may 76 1 -1 0 <NA> no 1154.881
## 4 may 92 1 -1 0 <NA> no NA
## 5 may 198 1 -1 0 <NA> no NA
## 6 may 139 1 -1 0 <NA> no 1758.416
## below_mean Anomaly
## 1 No Not an Anomaly
## 2 yes Not an Anomaly
## 3 yes Not an Anomaly
## 4 <NA> Not an Anomaly
## 5 <NA> Not an Anomaly
## 6 yes Not an Anomaly
Inference - It can be inferred that the maximum balance of the category with both loans is the least among all the categories and the highest belongs to the category with no loans.
default: has credit in default?
For the third grouping, I want to understand how the balance of the
client having a housing and personal loan are related.
data_frame_default <- data_frame|>
filter(!(is.na(default)))|>
group_by(default) |>
summarise(category_mean_balance = mean(balance,na.rm=TRUE), category_max_balance = max(balance,na.rm=TRUE), category_min_balance = min(balance,na.rm=TRUE), size=n())
data_frame_default
## # A tibble: 2 × 5
## default category_mean_balance category_max_balance category_min_balance size
## <chr> <dbl> <int> <int> <int>
## 1 no 1390. 102127 -4057 44396
## 2 yes -138. 16486 -8019 815
p <- data_frame_default |>
ggplot(aes(x = default, y=category_mean_balance) )+
geom_bar(fill="purple",position = "dodge", stat = "identity") +
theme_minimal()
p
total <- 44396+815
no_default<- 44396/total
no_default
## [1] 0.9819734
default_yes<- 815/total
default_yes
## [1] 0.01802659
# Pie Chart with Percentages
slices <- c(default_yes,no_default)
lbls <- c("default_yes","no_default")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)
# add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, main="Pie Chart")
count(data_frame,default,housing,loan)
## default housing loan n
## 1 no no no 16992
## 2 no no yes 2709
## 3 no yes no 20461
## 4 no yes yes 4234
## 5 yes no no 212
## 6 yes no yes 168
## 7 yes yes no 302
## 8 yes yes yes 133
I expected the row with the values (yes,yes,yes) to not show up, because logically speaking if your credit is in default you should not be eligible for any loans.
data_frame_combinations <- data_frame|>
filter(!(is.na(housing)&is.na(loan)&is.na(default)))|>
group_by(housing,loan, default) |>
summarise(size=n())
## `summarise()` has grouped output by 'housing', 'loan'. You can override using
## the `.groups` argument.
data_frame_combinations$default_housing_loan <- paste(data_frame_combinations$default,"_", data_frame_combinations$housing, "_", data_frame_combinations$loan)
data_frame_combinations
## # A tibble: 8 × 5
## # Groups: housing, loan [4]
## housing loan default size default_housing_loan
## <chr> <chr> <chr> <int> <chr>
## 1 no no no 16992 no _ no _ no
## 2 no no yes 212 yes _ no _ no
## 3 no yes no 2709 no _ no _ yes
## 4 no yes yes 168 yes _ no _ yes
## 5 yes no no 20461 no _ yes _ no
## 6 yes no yes 302 yes _ yes _ no
## 7 yes yes no 4234 no _ yes _ yes
## 8 yes yes yes 133 yes _ yes _ yes
total <- 16992+212+2709+168+20461+302+4234+133
# Pie Chart with Percentages
slices <- c(16992/total,212/total,2709/total,168/total,20461/total,302/total,4234/total,133/total)
lbls <- c("no _ no _ no","yes _ no _ no", "no _ no _ yes", "yes _ no _ yes", "no _ yes _ no", "yes _ yes _ no","no _ yes _ yes","yes _ yes _ yes")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct)
# add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, main="Pie Chart")
legend(-1.5,1, c("default_house_loan"), cex = 0.7)
We can see that any combination having the credit in default is 0%, implying that the occurrence of these combinations is very low.
The combination of the credit not defaulted, but having a housing loan and not a personal loan occurs the most, while the combination of having your credit defaulted and having both personal and housing loans occurs the least amount of times