Major Project

library(readxl)
# Location of the source workbook. The BANK_XLSX environment variable can
# override the default, so the script also runs on machines where this
# user-specific path does not exist.
bank_path <- Sys.getenv(
  "BANK_XLSX",
  unset = "C:/Users/malik/OneDrive/Desktop/bank.xlsx"
)
bank <- read_excel(bank_path)
# The data viewer is a GUI side effect; only open it in interactive sessions.
if (interactive()) {
  View(bank)
}
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 1.0.0 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(plotrix)
library(ggplot2)

# Count the missing cells across the whole data frame
sum(is.na(bank))

## [1] 438

# Drop every row that contains at least one missing value
bank <- na.omit(bank)
# The data viewer is a GUI side effect; only open it in interactive sessions.
if (interactive()) {
  View(bank)
}

# Verify that no missing values remain
sum(is.na(bank))

## [1] 0

# Histogram of the `day` column, drawn with bins that are 5 units wide
ggplot(bank, aes(day)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  ggtitle("Distribution of Days") +
  xlab("Day") +
  ylab("Frequency")

# Scatter plot
# Rank rows by balance (ties broken by loan), both descending, and keep the
# first 20 rows
top20 <- bank %>%
  arrange(desc(balance), desc(loan)) %>%
  slice_head(n = 20)

# Plot loan against balance for those 20 rows
ggplot(top20, aes(balance, loan)) +
  geom_point() +
  ggtitle("Scatter Plot of Top 20 Balances vs. Loans") +
  xlab("Balance") +
  ylab("Loan")

# Kernel density estimate of the `age` column, drawn as a semi-transparent
# filled curve
ggplot(bank) +
  geom_density(aes(x = age), alpha = 0.5, fill = "blue") +
  ggtitle("Age Density Plot") +
  xlab("Age") +
  ylab("Density")

# Pie chart
# One row per `housing` value with the number of records in `count`
housing_counts <- count(bank, housing, name = "count")

# A single full-width stacked bar wrapped into polar coordinates gives the pie
ggplot(housing_counts, aes(x = "", y = count, fill = housing)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL, fill = "Housing", title = "Housing Loan Status") +
  theme_void()

# Stacked area chart of record counts per month, split by campaign
# Count the rows for every (month, campaign) pair. `.groups = "drop"` returns
# an ungrouped tibble and stops summarise() from emitting its regrouping
# message.
month_campaign_counts <- bank %>%
  group_by(month, campaign) %>%
  summarise(count = n(), .groups = "drop")

# `campaign` appears to be stored as a number. A continuous fill defines no
# groups, so geom_area() would not draw one stacked band per campaign; mapping
# it as a factor makes every campaign value its own band.
ggplot(
  month_campaign_counts,
  aes(x = month, y = count, fill = factor(campaign))
) +
  geom_area() +
  labs(title = "Campaign Activity Over Months",
       x = "Month", y = "Count", fill = "Campaign")

# Measures of central tendency for the account balance
mean_balance <- mean(bank$balance)
median_balance <- median(bank$balance)
# Mode: the most frequent balance; ties resolve to the smallest such value.
# Frequencies are counted on the numeric values themselves rather than on the
# character labels produced by table(), so no precision is lost in a
# number -> string -> number round trip.
# NOTE: when every balance is unique this degenerates to the minimum value.
balance_values <- sort(unique(bank$balance))
balance_counts <- tabulate(
  match(bank$balance, balance_values),
  nbins = length(balance_values)
)
mode_balance <- balance_values[which.max(balance_counts)]

# Measures of dispersion for the account balance
range_balance <- range(bank$balance)  # length-2 vector: minimum, maximum
variance_balance <- var(bank$balance)
std_deviation_balance <- sd(bank$balance)
q1_balance <- quantile(bank$balance, 0.25)
q3_balance <- quantile(bank$balance, 0.75)
# IQR() is the built-in for Q3 - Q1 and returns a plain unnamed number
iqr_balance <- IQR(bank$balance)

# Print the balance statistics computed above, one labelled line per measure.
# Lines beginning with `##` are the console output recorded when the report
# was rendered.
cat("Measures of Central Tendency:\n")

## Measures of Central Tendency:

cat("Mean:", mean_balance, "\n")

## Mean: 493411.1

cat("Median:", median_balance, "\n")

## Median: 495070

cat("Mode:", mode_balance, "\n")

## Mode: 5442.22

cat("\nMeasures of Dispersion:\n")

## 
## Measures of Dispersion:

# range_balance holds two values, so both are printed on the same line
cat("Range:", range_balance, "\n")

## Range: 5442.22 999607.1

cat("Variance:", variance_balance, "\n")

## Variance: 85359157325

cat("Standard Deviation:", std_deviation_balance, "\n")

## Standard Deviation: 292162.9

cat("Interquartile Range (IQR):", iqr_balance, "\n")

## Interquartile Range (IQR): 512276.5

Regression Analysis

# Ordinary least-squares fit with balance as the response and loan as the
# single predictor
model <- lm(balance ~ loan, data = bank)

# Base-graphics scatter of the raw observations (loan on x, balance on y)
with(
  bank,
  plot(loan, balance,
       main = "Scatter Plot with Regression Line",
       xlab = "Loan Amount", ylab = "Balance")
)

# Overlay the fitted line in red
abline(model, col = "red")

# Coefficient table, residual summary and fit statistics
summary(model)

## 
## Call:
## lm(formula = balance ~ loan, data = bank)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -495539 -253900    4562  249602  506920 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.778e+05  2.669e+04  17.906   <2e-16 ***
## loan        2.810e-01  4.345e-01   0.647    0.518    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 292300 on 640 degrees of freedom
## Multiple R-squared:  0.0006529,  Adjusted R-squared:  -0.0009085 
## F-statistic: 0.4182 on 1 and 640 DF,  p-value: 0.5181

K-means Clustering

library(stats)  # Required for kmeans function


# Numeric columns used as clustering features
data_for_clustering <- bank[, c("balance", "loan", "duration")]

# Check for and handle missing values
data_for_clustering <- na.omit(data_for_clustering)

# Standardize the data (mean = 0, variance = 1) so that no feature dominates
# the Euclidean distance purely because of its units
scaled_data <- scale(data_for_clustering)

# Choose the number of clusters (you can adjust this)
num_clusters <- 3

# Perform K-Means clustering.
# kmeans() starts from randomly chosen centres, so fix the seed to make the
# assignment reproducible and use several random starts to avoid settling on
# a poor local optimum.
# NOTE: cluster numbering and means depend on the seed; a cluster summary
# recorded from an earlier unseeded run may differ from what this prints.
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = num_clusters, nstart = 25)

# Mean of each original (unscaled) feature within every cluster
cluster_summary <- aggregate(
  data_for_clustering,
  by = list(Cluster = kmeans_result$cluster),
  FUN = mean
)

# Print the cluster summary
print(cluster_summary)

##   Cluster  balance     loan  duration
## 1       1 438774.4 28913.75 14.521739
## 2       2 494802.5 69749.10  5.949309
## 3       3 572022.4 76190.56 18.965116

k-NN Classification

library(class)

# Predictor columns used for the distance computation
feature_cols <- c("balance", "loan", "duration")

# Split the dataset into a training set and a test set
set.seed(123)  # For reproducibility
sample_size <- floor(0.7 * nrow(bank))  # Adjust the split ratio as needed
# seq_len() stays empty for a zero-row data frame, unlike 1:nrow(bank)
train_indices <- sample(seq_len(nrow(bank)), size = sample_size)
train_data <- bank[train_indices, ]
test_data <- bank[-train_indices, ]

# Define the number of neighbors (k)
k <- 5  # You can adjust this value based on your needs

# Remove rows with missing values from training and test data
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)

# k-NN is distance based, so features on very different scales must be
# standardised first; otherwise the largest-valued column dominates the
# distance. The test set reuses the training set's centre and scale so that
# no information from the test rows leaks into the preprocessing.
train_features <- scale(train_data[, feature_cols])
test_features <- scale(
  test_data[, feature_cols],
  center = attr(train_features, "scaled:center"),
  scale = attr(train_features, "scaled:scale")
)

# Perform k-NN classification
housing_classes <- factor(train_data$housing)
predicted_categories <- knn(
  train = train_features,
  test = test_features,
  cl = housing_classes,
  k = k
)

# Evaluate the classification performance.
# Both factors share the same levels so the table is always square and its
# diagonal really holds the correct predictions, even if a class is never
# predicted.
housing_levels <- levels(housing_classes)
confusion_matrix <- table(
  Actual = factor(test_data$housing, levels = housing_levels),
  Predicted = factor(predicted_categories, levels = housing_levels)
)
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)

# Print the confusion matrix and accuracy
# NOTE: the recorded output below was produced before the features were
# standardised; re-run the script to refresh it.
print(confusion_matrix)

##        Predicted
## Actual  FALSE TRUE
##   FALSE    47   47
##   TRUE     39   60

cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.5544041

Apriori

# Load necessary libraries
library(arules)

## Warning: package 'arules' was built under R version 4.2.3

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

# Coerce the data frame to an arules "transactions" object.
# As the recorded warning below shows, columns that are neither logical nor
# factor are handled by arules' default discretization.
transactions <- as(bank, "transactions")

## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16 not
## logical or factor. Applying default discretization (see '? discretizeDF').

# Mine association rules with the Apriori algorithm, using a support
# parameter of 0.01 and a confidence parameter of 0.6
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.6))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 6 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2104 item(s), 642 transaction(s)] done [0.00s].
## sorting and recoding items ... [57 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.01s].
## writing ... [18179 rule(s)] done [0.01s].
## creating S4 object  ... done [0.00s].

# View the first ten mined rules. head() returns however many rules exist
# when fewer than ten were mined, whereas rules[1:10] would index past the
# end of the rule set.
inspect(head(rules, n = 10))

##      lhs                                  rhs                        support   
## [1]  {job=Mechanical Systems Engineer} => {day=[11,21)}              0.01090343
## [2]  {job=Senior Editor}               => {credit_card=NO}           0.01090343
## [3]  {job=Teacher}                     => {age=[18,40)}              0.01090343
## [4]  {job=Teacher}                     => {balance=[6.67e+05,1e+06]} 0.01090343
## [5]  {job=Senior Cost Accountant}      => {campaign=[1,7)}           0.01090343
## [6]  {job=Tax Accountant}              => {month=[8,12]}             0.01090343
## [7]  {job=Tax Accountant}              => {housing}                  0.01090343
## [8]  {job=Project Manager}             => {duration=[1,9)}           0.01246106
## [9]  {job=Project Manager}             => {credit_card=YES}          0.01246106
## [10] {job=Project Manager}             => {housing}                  0.01401869
##      confidence coverage   lift     count
## [1]  0.8750000  0.01246106 2.687799 7    
## [2]  0.8750000  0.01246106 1.739164 7    
## [3]  0.7777778  0.01401869 2.400641 7    
## [4]  0.7777778  0.01401869 2.333333 7    
## [5]  0.7000000  0.01557632 2.328497 7    
## [6]  0.7000000  0.01557632 1.695849 7    
## [7]  0.7000000  0.01557632 1.321765 7    
## [8]  0.6153846  0.02024922 1.881319 8    
## [9]  0.6153846  0.02024922 1.238486 8    
## [10] 0.6923077  0.02024922 1.307240 9

Major Project

Arsh Malik

2023-11-06

Regression Analysis

K-means Clustering

k-NN Classification

Apriori