library(readxl)
# Load the bank data set from the Excel workbook on the local desktop.
bank <- read_excel("C:/Users/malik/OneDrive/Desktop/bank.xlsx")
# View() opens a spreadsheet-style viewer window, so it is only called when a
# person is driving the session; batch runs (Rscript, knitting) skip it.
if (interactive()) {
  View(bank)
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotrix)
library(ggplot2)
# Count the missing (NA) cells across the whole data set.
sum(is.na(bank))
## [1] 438
# Drop every row that contains at least one NA value.
bank <- na.omit(bank)
# The viewer needs a display, so only open it in an interactive session.
if (interactive()) {
  View(bank)
}
# Verify that no NA values remain after the clean-up.
sum(is.na(bank))
## [1] 0
# Histogram of the `day` column, using bins that are 5 units wide.
ggplot(bank, mapping = aes(day)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")

# Scatter plot
# Rank the rows by balance, then by loan (both descending), and keep the
# first 20 of them.
top20 <- bank %>%
  arrange(desc(balance), desc(loan)) %>%
  head(20)
# Plot loan against balance for those 20 rows.
ggplot(top20, mapping = aes(balance, loan)) +
  geom_point() +
  ggtitle("Scatter Plot of Top 20 Balances vs. Loans") +
  xlab("Balance") +
  ylab("Loan")

# Density estimate of the `age` column, drawn as a semi-transparent blue area.
ggplot(bank, mapping = aes(age)) +
  geom_density(alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")

# Pie chart
# Tally the number of rows for each value of `housing`.
housing_counts <- dplyr::count(bank, housing, name = "count")
# A single stacked bar wrapped into polar coordinates produces the pie.
ggplot(housing_counts, mapping = aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(title = "Housing Loan Status", fill = "Housing", x = NULL, y = NULL) +
  theme_void()

# Stacked area chart of row counts per month, split by campaign.
# Count the rows for every month/campaign combination. `.groups = "drop"`
# returns an ungrouped result and silences the summarise() regrouping message.
month_campaign_counts <- bank %>%
  group_by(month, campaign) %>%
  summarise(count = n(), .groups = "drop")
# `campaign` is converted to a factor for the fill aesthetic: a continuous
# fill does not split the data into groups, so without the conversion the
# areas are not stacked per campaign.
ggplot(month_campaign_counts,
       aes(x = month, y = count, fill = factor(campaign))) +
  geom_area() +
  labs(title = "Campaign Activity Over Months",
       x = "Month", y = "Count", fill = "Campaign")

# Descriptive statistics for the `balance` column.
# Central tendency: mean, median and mode.
mean_balance <- mean(bank[["balance"]])
median_balance <- median(bank[["balance"]])
# Mode = most frequent value; on ties the first (smallest) value is taken.
mode_balance <- as.numeric(names(which.max(table(bank[["balance"]]))))
# Dispersion: range (min and max), variance, standard deviation and quartiles.
range_balance <- range(bank[["balance"]])
variance_balance <- var(bank[["balance"]])
std_deviation_balance <- sd(bank[["balance"]])
q1_balance <- quantile(bank[["balance"]], probs = 0.25)
q3_balance <- quantile(bank[["balance"]], probs = 0.75)
# Interquartile range: distance between the third and first quartiles.
iqr_balance <- q3_balance - q1_balance
# Report the results on the console.
cat("Measures of Central Tendency:\n")
## Measures of Central Tendency:
cat("Mean:", mean_balance, "\n")
## Mean: 493411.1
cat("Median:", median_balance, "\n")
## Median: 495070
cat("Mode:", mode_balance, "\n")
## Mode: 5442.22
cat("\nMeasures of Dispersion:\n")
##
## Measures of Dispersion:
cat("Range:", range_balance, "\n")
## Range: 5442.22 999607.1
cat("Variance:", variance_balance, "\n")
## Variance: 85359157325
cat("Standard Deviation:", std_deviation_balance, "\n")
## Standard Deviation: 292162.9
cat("Interquartile Range (IQR):", iqr_balance, "\n")
## Interquartile Range (IQR): 512276.5
# Regression analysis
# Simple linear regression with balance as the response and loan as the
# single predictor.
model <- lm(balance ~ loan, data = bank)
# Scatter plot of the observations: loan on the x axis, balance on the y axis.
with(
  bank,
  plot(loan, balance,
       main = "Scatter Plot with Regression Line",
       xlab = "Loan Amount", ylab = "Balance")
)
# Overlay the fitted regression line in red.
abline(reg = model, col = "red")

# Coefficients, residual summary and goodness-of-fit statistics.
summary(model)
##
## Call:
## lm(formula = balance ~ loan, data = bank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -495539 -253900 4562 249602 506920
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.778e+05 2.669e+04 17.906 <2e-16 ***
## loan 2.810e-01 4.345e-01 0.647 0.518
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 292300 on 640 degrees of freedom
## Multiple R-squared: 0.0006529, Adjusted R-squared: -0.0009085
## F-statistic: 0.4182 on 1 and 640 DF, p-value: 0.5181
# K-means clustering
library(stats) # Provides kmeans()
# Numeric features used for clustering.
data_for_clustering <- bank[, c("balance", "loan", "duration")]
# Drop rows with a missing value in any of the three features.
data_for_clustering <- na.omit(data_for_clustering)
# Standardize each feature (mean = 0, sd = 1) so that no single feature
# dominates the distance computation because of its scale.
scaled_data <- scale(data_for_clustering)
# Number of clusters (adjust as needed).
num_clusters <- 3
# kmeans() starts from randomly chosen centers. Fix the seed so the cluster
# assignment is reproducible, and try several random starts so the best
# solution found is kept instead of whatever a single start converges to.
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = num_clusters, nstart = 25)
# Mean of each original (unscaled) feature within every cluster.
cluster_summary <- aggregate(
  data_for_clustering,
  by = list(Cluster = kmeans_result$cluster),
  FUN = mean
)
# Print the cluster summary.
print(cluster_summary)
# k-NN classification
library(class)
# Split the data set into a training set (70%) and a test set (30%).
set.seed(123) # For reproducibility
sample_size <- floor(0.7 * nrow(bank)) # Adjust the split ratio as needed
# seq_len() stays correct even if `bank` has zero rows (1:0 would be c(1, 0)).
train_indices <- sample(seq_len(nrow(bank)), size = sample_size)
train_data <- bank[train_indices, ]
test_data <- bank[-train_indices, ]
# Number of neighbours (k); adjust as needed.
k <- 5
# Remove rows with missing values from the training and test data.
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)
# knn() uses Euclidean distance, so features on very different scales must be
# standardized first; otherwise the feature with the largest magnitude decides
# the neighbours on its own. The test set is scaled with the training set's
# center and spread so that no information from the test set is used.
feature_cols <- c("balance", "loan", "duration")
train_features <- scale(train_data[, feature_cols])
test_features <- scale(
  test_data[, feature_cols],
  center = attr(train_features, "scaled:center"),
  scale = attr(train_features, "scaled:scale")
)
# Predict `housing` for every test row from its k nearest training rows.
predicted_categories <- knn(train = train_features,
                            test = test_features,
                            cl = train_data$housing, k = k)
# Cross-tabulate actual against predicted classes.
confusion_matrix <- table(Actual = test_data$housing, Predicted = predicted_categories)
# Share of correct predictions. Comparing the labels directly does not rely on
# the confusion matrix being square (a class absent from the test set would
# otherwise misalign its diagonal).
accuracy <- mean(as.character(predicted_categories) == as.character(test_data$housing))
# Print the confusion matrix and accuracy.
print(confusion_matrix)
cat("Accuracy:", accuracy)
# Apriori association rules
# Load necessary libraries
library(arules)
## Warning: package 'arules' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Coerce the data to the arules "transactions" format.
transactions <- as(bank, "transactions")
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16 not
## logical or factor. Applying default discretization (see '? discretizeDF').
# Mine association rules with the Apriori algorithm, keeping rules with a
# support of at least 0.01 and a confidence of at least 0.6.
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.6))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 6
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2104 item(s), 642 transaction(s)] done [0.00s].
## sorting and recoding items ... [57 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.01s].
## writing ... [18179 rule(s)] done [0.01s].
## creating S4 object ... done [0.00s].
# Show the first 10 mined rules. head() returns however many rules exist when
# there are fewer than 10, whereas rules[1:10] would index past the end.
inspect(head(rules, n = 10))
## lhs rhs support
## [1] {job=Mechanical Systems Engineer} => {day=[11,21)} 0.01090343
## [2] {job=Senior Editor} => {credit_card=NO} 0.01090343
## [3] {job=Teacher} => {age=[18,40)} 0.01090343
## [4] {job=Teacher} => {balance=[6.67e+05,1e+06]} 0.01090343
## [5] {job=Senior Cost Accountant} => {campaign=[1,7)} 0.01090343
## [6] {job=Tax Accountant} => {month=[8,12]} 0.01090343
## [7] {job=Tax Accountant} => {housing} 0.01090343
## [8] {job=Project Manager} => {duration=[1,9)} 0.01246106
## [9] {job=Project Manager} => {credit_card=YES} 0.01246106
## [10] {job=Project Manager} => {housing} 0.01401869
## confidence coverage lift count
## [1] 0.8750000 0.01246106 2.687799 7
## [2] 0.8750000 0.01246106 1.739164 7
## [3] 0.7777778 0.01401869 2.400641 7
## [4] 0.7777778 0.01401869 2.333333 7
## [5] 0.7000000 0.01557632 2.328497 7
## [6] 0.7000000 0.01557632 1.695849 7
## [7] 0.7000000 0.01557632 1.321765 7
## [8] 0.6153846 0.02024922 1.881319 8
## [9] 0.6153846 0.02024922 1.238486 8
## [10] 0.6923077 0.02024922 1.307240 9