Project_250617

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:


library(mlbench)
library(readr)
library(data.table)
library(stats)
library(arules)
library(arulesViz)
library(kernlab)
library(Matrix)

## Read the bank marketing data using the read.csv() defaults
## (first row is the header, fields are comma-separated).
bank_full <- read.csv(
  "C:\\Users\\huiwu\\OneDrive\\Desktop\\CAU\\Spring2025\\R_DataScience\\bank-full.csv"
)

## Take a first look at the data: per-column summaries, then the top 5 rows.
summary(bank_full)
head(bank_full, n = 5)

## Keep the 8 (out of 17) columns that the rest of the analysis works with.
## They are selected by name rather than by position so the selection still
## picks the intended columns if the column order of the CSV ever changes.
bank_full <- bank_full[, c(
  "age", "job", "marital", "education",
  "balance", "housing", "loan", "month"
)]

## More in-depth information: the type and first values of each kept column.
str(bank_full)

## Attach the plotting packages. library() stops with an error when a package
## is missing, whereas require() only returns FALSE and lets the script carry
## on until a later, less obvious failure.
library(lattice)
library(ggplot2)

## Simple bar chart showing the overall age distribution in the banking
## data set.
ggplot(bank_full, aes(x = age)) +
  geom_bar()

## One bar chart per categorical variable; each bar is the number of records
## that fall into that level of the variable.

## Job category
ggplot(data = bank_full, mapping = aes(x = job)) +
  geom_bar()

## Marital status
ggplot(data = bank_full, mapping = aes(x = marital)) +
  geom_bar()

## Education level
ggplot(data = bank_full, mapping = aes(x = education)) +
  geom_bar()

## Housing loan flag
ggplot(data = bank_full, mapping = aes(x = housing)) +
  geom_bar()

## Personal loan flag
ggplot(data = bank_full, mapping = aes(x = loan)) +
  geom_bar()

## Month
ggplot(data = bank_full, mapping = aes(x = month)) +
  geom_bar()


## Multi-panel bar charts: one panel per job category.

## Age distribution within each job category.
ggplot(data = bank_full, mapping = aes(x = age)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~job)

## Education levels within each job category.
ggplot(data = bank_full, mapping = aes(x = education)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~job)

## Marital status within each job category.
ggplot(data = bank_full, mapping = aes(x = marital)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~job)

## Multi-panel bar charts: one panel per value of the loan column.

## Housing flag, split by loan.
ggplot(data = bank_full, mapping = aes(x = housing)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)

## Marital status, split by loan.
ggplot(data = bank_full, mapping = aes(x = marital)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)

## Job category, split by loan.
ggplot(data = bank_full, mapping = aes(x = job)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)

## Education level, split by loan.
ggplot(data = bank_full, mapping = aes(x = education)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)



## Base plot for the box plots: account balance for each job category.
balance <- ggplot(bank_full, aes(x = factor(job), y = balance))

## Balance by job with outliers drawn as individual points.
balance + geom_boxplot()

## Balance by job without outlier points, with the y-axis zoomed to the
## 10th-90th percentile range of balance. coord_cartesian() only zooms the
## view; setting the limits through scale_y_continuous() would discard every
## row outside the range before the box statistics are computed, so the boxes
## themselves would no longer describe the full data.
balance +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = quantile(bank_full$balance, c(0.1, 0.9)))

## Mine association rules from the selected columns with apriori():
##   support = 0.025   items/itemsets must appear in at least 2.5% of
##                     transactions
##   confidence = 0.75 rules must have at least 75% confidence (strong rules)
##   minlen = 2        rules must have at least 2 items
## NOTE(review): bank_full is passed as a data frame, so apriori() has to
## coerce it to transactions itself; how the numeric columns (age, balance)
## are turned into items depends on the installed arules version - confirm.
bankrules <- apriori(
  bank_full,
  parameter = list(support = 0.025, confidence = 0.75, minlen = 2)
)
bankrules

## Display the first 10 association rules from bankrules. head() returns at
## most 10 rules, so this also works when fewer rules were found.
inspect(head(bankrules, n = 10))

## Mine the rules again with the same support (2.5%) and confidence (75%)
## thresholds, this time requiring at least 3 items per rule (minlen = 3).
## This replaces the previous contents of bankrules.
bankrules <- apriori(
  bank_full,
  parameter = list(support = 0.025, confidence = 0.75, minlen = 3)
)
bankrules

## Display the first 10 of these rules (fewer if fewer were found).
inspect(head(bankrules, n = 10))

## Sort the association rules by confidence in descending order, so the rules
## with the highest confidence appear first.
bankrules <- sort(bankrules, by = "confidence", decreasing = TRUE)

## Keep a second copy sorted by lift. sort() for rules defaults to
## decreasing = TRUE, so the rules with the highest lift come first; the
## argument is spelled out here to make the direction explicit.
bankrules.sorted <- sort(bankrules, by = "lift", decreasing = TRUE)

## Prune rules from the lift-sorted rule set.

## Step 1: build a rule-by-rule matrix showing which rules are subsets of
## which other rules.
subset.matrix <- is.subset(bankrules.sorted, bankrules.sorted)

## Step 2: blank out the lower triangle and the diagonal with NA so that only
## the upper triangle remains and each pair of rules is compared once.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA

## Step 3: for every column, count the remaining TRUE entries (NA entries are
## ignored) and flag the rule as redundant when the count reaches 100.
## NOTE(review): 100 is a very high threshold, so few or no rules will be
## flagged; the commonly seen form of this idiom uses >= 1 - confirm that 100
## is intended.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 100

## Step 4: show the positions of the flagged rules. An empty result means no
## rule reached the threshold.
which(redundant)

## Step 5: keep only the rules that were not flagged.
bankrules.pruned <- bankrules.sorted[!redundant]

## Step 6: display all rules that remain after pruning.
inspect(bankrules.pruned)

## Sort the association rules by lift in descending order ...
bankrules <- sort(bankrules, by = "lift", decreasing = TRUE)

## ... and display the top 6 rules with the highest lift values (fewer if
## fewer rules exist).
inspect(head(bankrules, n = 6))

## summary() provides overview statistics for the rule set:
##   - total number of rules found
##   - distribution of rule lengths (number of items in antecedent +
##     consequent)
##   - summary statistics (quartiles and mean) for support, confidence,
##     coverage, lift and count
summary(bankrules)

## The 5 rules with the highest confidence. sort() for rules orders from
## highest to lowest by default; the direction is spelled out for clarity.
inspect(head(sort(bankrules, by = "confidence", decreasing = TRUE), n = 5))

## The 5 rules with the highest lift values - the strongest, most surprising
## patterns in the banking data.
inspect(head(sort(bankrules, by = "lift", decreasing = TRUE), n = 5))

## Scatter plots of the association rules (arulesViz), first for the full
## rule set and then for the pruned one. Plot elements:
##   x-axis         support (how frequently the rule appears in the data)
##   y-axis         lift (how much stronger the association is than chance)
##   colour/shading confidence (how reliable each rule is)
##   each point     one association rule
plot(bankrules, measure = c("support", "lift"), shading = "confidence")

plot(bankrules.pruned, measure = c("support", "lift"), shading = "confidence")

```