This is a banking data analysis script.
Getting started with loading all required libraries.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotrix)
library(ggplot2)
Loading dataset named banking
# readr is already attached by library(tidyverse); this explicit call is redundant but harmless.
library(readr)
# Read the CSV into a tibble named `banking`.
# NOTE(review): absolute, user-specific path - the script only runs on a machine that has this exact file.
banking <- read_csv("C:/Users/malik/Downloads/banking.csv")
## Rows: 1000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, email, phone_number, job
## dbl (8): customer_id, age, balance, loan, day, month, year, campaign
## lgl (1): housing
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session; View() is
# a GUI side effect that should not run when the script is rendered or batched.
if (interactive()) {
  View(banking)
}
Performing Data Cleaning
Get dataset’s description
# dim() returns c(rows, columns). Collapse it into one string first so paste()
# builds a single message instead of recycling the label once per dimension.
print(paste("Dimensions of dataset:", paste(dim(banking), collapse = " x ")))
## [1] "Dimensions of dataset: 1000 x 13"
Summary of my Dataset
# Per-column summary of the raw data; the NA's rows in the output show which columns have missing values.
summary(banking)
## name email customer_id age
## Length:1000 Length:1000 Min. : 1.0 Min. :18.00
## Class :character Class :character 1st Qu.: 250.8 1st Qu.:34.00
## Mode :character Mode :character Median : 500.5 Median :51.00
## Mean : 500.5 Mean :51.21
## 3rd Qu.: 750.2 3rd Qu.:69.00
## Max. :1000.0 Max. :85.00
## NA's :88
## phone_number balance loan housing
## Length:1000 Min. : 5442 Min. :10024 Mode :logical
## Class :character 1st Qu.:249472 1st Qu.:32650 FALSE:480
## Mode :character Median :515405 Median :53930 TRUE :486
## Mean :507163 Mean :54886 NA's :34
## 3rd Qu.:761599 3rd Qu.:77593
## Max. :999607 Max. :99977
## NA's :42
## day month year campaign
## Min. : 1.00 Min. : 1.000 Min. :1995 Min. : 1.00
## 1st Qu.: 9.00 1st Qu.: 3.000 1st Qu.:2001 1st Qu.: 6.00
## Median :17.00 Median : 7.000 Median :2008 Median :11.00
## Mean :16.55 Mean : 6.527 Mean :2008 Mean :10.61
## 3rd Qu.:24.00 3rd Qu.:10.000 3rd Qu.:2016 3rd Qu.:16.00
## Max. :31.00 Max. :12.000 Max. :2022 Max. :20.00
## NA's :39
## job
## Length:1000
## Class :character
## Mode :character
##
##
##
##
Number of NA values in the banking dataset
# is.na() flags every missing cell, so this is the total count of NA cells (not the number of affected rows).
sum(is.na(banking))
## [1] 354
Removing rows that contain NA values
# Drop every row that has an NA in any column, keeping only complete records.
bank <- na.omit(banking)
# Inspect the cleaned data only when running interactively; View() is a GUI
# side effect that should not run when the script is rendered or batched.
if (interactive()) {
  View(bank)
}
Verifying that no NA values remain
# Count the NA cells left in the cleaned data.
sum(is.na(bank))
## [1] 0
HISTOGRAM:-
# Histogram of the `day` column, grouped into bins 5 units wide
bank %>%
  ggplot(mapping = aes(x = day)) +
  geom_histogram(binwidth = 5, color = "black", fill = "lightgreen") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")
PIE CHART:-
# Tally the rows for each value of `housing`
housing_counts <- count(bank, housing, name = "count")
# Pie chart: a single full-width bar of the counts, wrapped into polar
# coordinates along the y axis
ggplot(data = housing_counts,
       mapping = aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  labs(title = "Housing Loan Status", fill = "Housing", x = NULL, y = NULL) +
  theme_void()
Density Plot
# Kernel density estimate of the `age` column
bank %>%
  ggplot(mapping = aes(x = age)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")
SCATTER PLOT:-
# Order rows by balance, then by loan (both descending), and keep the first 20
top20 <- bank %>%
  arrange(desc(balance), desc(loan)) %>%
  head(20)
# Plot loan against balance for those 20 rows
ggplot(top20, aes(balance, loan)) +
  geom_point() +
  ggtitle("Scatter Plot of Top 20 Balances vs. Loans") +
  xlab("Balance") +
  ylab("Loan")
LINE CHART:-
# Mean balance for each year, drawn as a line across years
bank %>%
  group_by(year) %>%
  summarise(mean_balance = mean(balance)) %>%
  ggplot(mapping = aes(year, mean_balance)) +
  geom_line() +
  ggtitle("Average Balance Over Years") +
  xlab("Year") +
  ylab("Average Balance")
# Stacked Area Chart for Month and Campaign
# Count rows per (month, campaign) pair. `.groups = "drop"` returns an
# ungrouped result and silences summarise()'s regrouping message. complete()
# then adds an explicit zero count for every pair that never occurs, so each
# campaign has a value at every month when the areas are stacked.
month_campaign_counts <- bank %>%
  group_by(month, campaign) %>%
  summarise(count = n(), .groups = "drop") %>%
  complete(month, campaign, fill = list(count = 0L))
# `campaign` is numeric: mapped to fill as-is it yields a continuous scale and
# a single group, so nothing is actually stacked. Converting it to a factor
# gives one area (and one legend entry) per campaign.
ggplot(month_campaign_counts,
       aes(x = month, y = count, fill = factor(campaign))) +
  geom_area() +
  labs(title = "Campaign Activity Over Months",
       x = "Month", y = "Count", fill = "Campaign")
Computing Measures of Central Tendency and Dispersion:-
# Central tendency of the balance column
mean_balance <- with(bank, mean(balance))
median_balance <- with(bank, median(balance))
# Mode: tabulate the values and take the first one with the highest count.
# NOTE(review): if no balance value repeats, every count is 1 and this simply
# returns the smallest balance - confirm a mode is meaningful for this column.
mode_balance <- as.numeric(names(which.max(table(bank$balance))))
# Dispersion of the balance column
range_balance <- with(bank, range(balance))
variance_balance <- with(bank, var(balance))
std_deviation_balance <- with(bank, sd(balance))
# First and third quartiles, and their difference (the interquartile range)
q1_balance <- with(bank, quantile(balance, probs = 0.25))
q3_balance <- with(bank, quantile(balance, probs = 0.75))
iqr_balance <- q3_balance - q1_balance
# Display the Central Tendency
# Headings use cat() so "\n" is written as a real line break; print() would
# echo the string quoted, with the escape sequence shown literally.
cat("Measures of Central Tendency:\n")
## Measures of Central Tendency:
cat("Mean:", mean_balance, "\n")
## Mean: 494321.5
cat("Median:", median_balance, "\n")
## Median: 497429.6
cat("Mode:", mode_balance, "\n")
## Mode: 5442.22
# Display the Measures of Dispersion
cat("\nMeasures of Dispersion:\n")
##
## Measures of Dispersion:
cat("Range:", range_balance, "\n")
## Range: 5442.22 999607.1
cat("Variance:", variance_balance, "\n")
## Variance: 83574460647
cat("Standard Deviation:", std_deviation_balance, "\n")
## Standard Deviation: 289092.5
cat("Interquartile Range (IQR):", iqr_balance, "\n")
## Interquartile Range (IQR): 495985.4