This document presents a banking data analysis written in R.
Getting started with loading all required libraries.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotrix)
library(ggplot2)
Loading dataset named banking
library(readr)
# Load the banking data set. The file location can be supplied through the
# BANKING_CSV environment variable; when it is not set, the original
# machine-specific path is used.
banking <- read_csv(
  Sys.getenv("BANKING_CSV", unset = "C:/Users/malik/Downloads/banking.csv")
)
## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, email, phone_number, job
## dbl (8): customer_id, age, balance, loan, day, month, year, campaign
## lgl (1): housing
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in an interactive session, so the script can
# also be rendered or run unattended without trying to open a viewer window.
if (interactive()) {
  View(banking)
}
Performing Data Cleaning
Get dataset’s description
# dim() returns c(rows, columns). Collapse it into one piece of text first;
# otherwise paste() recycles the label and produces one string per dimension.
print(paste("Dimensions of dataset:", paste(dim(banking), collapse = " x ")))
## [1] "Dimensions of dataset: 1000 x 13"
Summary of my Dataset
# Per-column summary of the raw data (ranges, quartiles and NA counts).
banking %>%
  summary()
## name email customer_id age
## Length:1000 Length:1000 Min. : 1.0 Min. :18.00
## Class :character Class :character 1st Qu.: 250.8 1st Qu.:34.00
## Mode :character Mode :character Median : 500.5 Median :51.00
## Mean : 500.5 Mean :51.21
## 3rd Qu.: 750.2 3rd Qu.:69.00
## Max. :1000.0 Max. :85.00
## NA's :88
## phone_number balance loan housing
## Length:1000 Min. : 5442 Min. :10024 Mode :logical
## Class :character 1st Qu.:249472 1st Qu.:32650 FALSE:480
## Mode :character Median :515405 Median :53930 TRUE :486
## Mean :507163 Mean :54886 NA's :34
## 3rd Qu.:761599 3rd Qu.:77593
## Max. :999607 Max. :99977
## NA's :42
## day month year campaign
## Min. : 1.00 Min. : 1.000 Min. :1995 Min. : 1.00
## 1st Qu.: 9.00 1st Qu.: 3.000 1st Qu.:2001 1st Qu.: 6.00
## Median :17.00 Median : 7.000 Median :2008 Median :11.00
## Mean :16.55 Mean : 6.527 Mean :2008 Mean :10.61
## 3rd Qu.:24.00 3rd Qu.:10.000 3rd Qu.:2016 3rd Qu.:16.00
## Max. :31.00 Max. :12.000 Max. :2022 Max. :20.00
## NA's :39
## job
## Length:1000
## Class :character
## Mode :character
##
##
##
##
Number of NA values in the banking dataset
# Total number of missing cells across every column of the raw data.
banking %>%
  is.na() %>%
  sum()
## [1] 354
Removing the rows that contain NA values
# Keep only complete rows: na.omit() drops every row that has an NA in any
# column. All later steps work on this cleaned copy.
bank <- na.omit(banking)
# Open the data viewer only in an interactive session, so the script can
# also be rendered or run unattended without trying to open a viewer window.
if (interactive()) {
  View(bank)
}
Verifying that no NA values remain
# Total number of missing cells left in the cleaned data.
bank %>%
  is.na() %>%
  sum()
## [1] 0
HISTOGRAM:-
# Histogram of the `day` column, drawn with bins that are 5 units wide.
ggplot(bank, aes(day)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "lightgreen") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")
PIE CHART:-
# Tally the rows for each value of the `housing` flag.
housing_counts <- count(bank, housing, name = "count")
# Pie chart: a single stacked bar wrapped around polar coordinates, one
# slice per `housing` value, with the axes and background stripped away.
ggplot(housing_counts, aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(
    title = "Housing Loan Status",
    fill = "Housing",
    x = NULL,
    y = NULL
  ) +
  theme_void()
Density Plot
# Kernel density estimate of customer age, drawn as a semi-transparent area.
ggplot(bank, aes(age)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")
SCATTER PLOT:-
# Sort by balance (largest first), breaking ties by loan (largest first),
# and keep the first 20 rows.
top20 <- head(bank[order(-bank$balance, -bank$loan), ], 20)
# Scatter plot of loan against balance for those 20 rows.
ggplot(data = top20, aes(x = balance, y = loan)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Top 20 Balances vs. Loans",
    x = "Balance",
    y = "Loan"
  )
LINE CHART:-
# Line chart of the mean balance for each value of `year`.
ggplot(
  summarise(group_by(bank, year), mean_balance = mean(balance)),
  aes(year, mean_balance)
) +
  geom_line() +
  ggtitle("Average Balance Over Years") +
  xlab("Year") +
  ylab("Average Balance")
# Stacked Area Chart for Month and Campaign
# Count the rows for every month/campaign pair. Pairs that never occur are
# added with a count of 0 so that each campaign has a value at every month
# and the stacked areas line up. count() returns an ungrouped result, so no
# grouping message is emitted.
month_campaign_counts <- bank %>%
  count(month, campaign, name = "count") %>%
  complete(month, campaign, fill = list(count = 0L)) %>%
  mutate(campaign = factor(campaign))
# `campaign` is stored as a number. Converting it to a factor gives one
# discrete fill, and therefore one stacked area, per campaign value; a
# numeric fill would be continuous and would place all rows in one area.
ggplot(month_campaign_counts, aes(x = month, y = count, fill = campaign)) +
  geom_area() +
  labs(
    title = "Campaign Activity Over Months",
    x = "Month",
    y = "Count",
    fill = "Campaign"
  )
Computing Measures of Central Tendency and Dispersion:-
# Central tendency of the account balance.
mean_balance <- mean(bank$balance)
median_balance <- median(bank$balance)
# Mode: tabulate the values, order the table by frequency (highest first)
# and convert the label of the first entry back to a number.
# NOTE(review): the reported mode (5442.22) equals the reported minimum of
# the range, which suggests that no balance repeats and this is simply the
# first table entry -- confirm that a mode is meaningful for this column.
mode_balance <- bank$balance %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  names() %>%
  head(1) %>%
  as.numeric()
# Dispersion of the account balance.
range_balance <- c(min(bank$balance), max(bank$balance))
variance_balance <- var(bank$balance)
# The standard deviation is the square root of the sample variance.
std_deviation_balance <- sqrt(variance_balance)
q1_balance <- quantile(bank$balance, probs = 0.25)
q3_balance <- quantile(bank$balance, probs = 0.75)
# Interquartile range: distance between the third and first quartiles.
iqr_balance <- q3_balance - q1_balance
# Display the Central Tendency
# The headings use cat() like the values do: print() would show the quoted
# string with a literal "\n" instead of emitting a line break.
cat("Measures of Central Tendency:\n")
## Measures of Central Tendency:
cat("Mean:", mean_balance, "\n")
## Mean: 494321.5
cat("Median:", median_balance, "\n")
## Median: 497429.6
cat("Mode:", mode_balance, "\n")
## Mode: 5442.22
# Display the Measures of Dispersion
cat("\nMeasures of Dispersion:\n")
##
## Measures of Dispersion:
# range_balance holds two values (minimum and maximum); cat() prints both.
cat("Range:", range_balance, "\n")
## Range: 5442.22 999607.1
cat("Variance:", variance_balance, "\n")
## Variance: 83574460647
cat("Standard Deviation:", std_deviation_balance, "\n")
## Standard Deviation: 289092.5
cat("Interquartile Range (IQR):", iqr_balance, "\n")
## Interquartile Range (IQR): 495985.4
Conclusion
# 10 highest loan borrowers, largest loan first.
# head() returns at most 10 rows and never indexes past the end of the
# table, which `[1:10, ]` does when fewer than 10 rows are available.
highestloan_10 <- head(bank[order(-bank$loan), ], 10)
highestloan_10
## # A tibble: 10 × 13
## name email custo…¹ age phone…² balance loan housing day month year
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl>
## 1 Sabra sbott… 389 82 263-91… 200990. 99977. FALSE 26 9 1995
## 2 Wit wfree… 180 35 965-73… 183993. 99881. TRUE 21 5 2002
## 3 Minna mmang… 149 69 129-34… 883715. 99870. TRUE 18 2 2016
## 4 Fionna fmart… 966 73 564-36… 12440. 99853. TRUE 31 2 2004
## 5 Homerus harne… 191 34 785-16… 122084. 99778. TRUE 30 6 2001
## 6 Dugald dmatt… 187 65 539-50… 890699. 99717. FALSE 26 5 2018
## 7 Cassie cmcca… 335 68 186-46… 924931. 99577. FALSE 20 1 2018
## 8 Sandi smonc… 297 73 886-52… 48548. 99478. TRUE 14 11 2018
## 9 Massimo mflam… 277 39 887-47… 536319. 99088. TRUE 26 12 2009
## 10 Kassi kbidm… 790 32 525-84… 657573. 98964. TRUE 17 3 2018
## # … with 2 more variables: campaign <dbl>, job <chr>, and abbreviated variable
## # names ¹customer_id, ²phone_number
# 10 highest balance customers, largest balance first.
# head() returns at most 10 rows and never indexes past the end of the
# table, which `[1:10, ]` does when fewer than 10 rows are available.
highestbal_10 <- head(bank[order(-bank$balance), ], 10)
highestbal_10
## # A tibble: 10 × 13
## name email custo…¹ age phone…² balance loan housing day month year
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl>
## 1 Danette dort… 469 65 406-17… 999607. 75049. TRUE 5 7 2014
## 2 Yardley ymas… 928 35 970-31… 991077. 76478. TRUE 5 9 2001
## 3 Antonie… acoc… 549 27 992-57… 990204. 57300. FALSE 7 5 2002
## 4 Hilly hsze… 513 68 294-72… 990097. 48897. FALSE 17 7 2015
## 5 Elonore eyou… 398 74 785-60… 989874. 37838. TRUE 13 4 2019
## 6 Pam psie… 862 53 409-90… 989630. 17295. TRUE 19 8 2017
## 7 Jackque… jwad… 31 28 507-53… 986247. 80061. FALSE 31 3 1996
## 8 Corinne cpen… 39 36 488-41… 985569. 81940. TRUE 8 8 2019
## 9 Ranna rrym… 79 52 945-87… 984917. 52647. FALSE 8 8 2011
## 10 Mechelle msta… 574 19 256-16… 983253. 76958. TRUE 4 7 2000
## # … with 2 more variables: campaign <dbl>, job <chr>, and abbreviated variable
## # names ¹customer_id, ²phone_number
From the analysis and visualizations above, we identified several patterns and trends: how many customers the bank has, which age groups are engaged, how many customers participate in campaign activities, who holds the highest balances and loans, and whether customers have a housing loan.