This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r}
library(mlbench)
library(readr)
library(data.table)
library(stats)
library(arules)
library(arulesViz)
library(kernlab)
library(Matrix)
## Load the bank data set from its local CSV export.
## (Reader options that were left disabled on this call: header=T, sep=";")
bank_full <- read.csv(
  "C:\\Users\\huiwu\\OneDrive\\Desktop\\CAU\\Spring2025\\R_DataScience\\bank-full.csv"
)

## Take a first look at the data: per-column summary, then the top 5 rows.
summary(bank_full)
head(bank_full, n = 5)

## Extract the 8 most important (out of 17) columns of this data set to
## analyze. Columns are selected by position.
bank_full <- bank_full[, c(1, 2, 3, 4, 6, 7, 8, 11)]

## More in-depth information about the retained columns.
str(bank_full)
## Plotting packages. library() is used rather than require() so that a
## missing package stops the script here with an error, instead of
## require() returning FALSE and the failure surfacing at the first plot.
library(lattice)
library(ggplot2)

## Simple bar charts showing the overall distribution of each variable in
## the banking data set, one chart per variable.
ggplot(bank_full, aes(x = age)) + geom_bar()
ggplot(bank_full, aes(x = job)) + geom_bar()
ggplot(bank_full, aes(x = marital)) + geom_bar()
ggplot(bank_full, aes(x = education)) + geom_bar()
ggplot(bank_full, aes(x = housing)) + geom_bar()
ggplot(bank_full, aes(x = loan)) + geom_bar()
ggplot(bank_full, aes(x = month)) + geom_bar()

## Multi-panel bar charts: distribution of age, education and marital
## status across the different job categories (one panel per job).
ggplot(bank_full, aes(age)) + geom_bar(fill = "skyblue") + facet_wrap(~job)
ggplot(bank_full, aes(education)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~job)
ggplot(bank_full, aes(marital)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~job)

## Multi-panel bar charts: distribution of housing, marital status, job and
## education, with one panel per value of the loan column.
ggplot(bank_full, aes(housing)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)
ggplot(bank_full, aes(marital)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)
ggplot(bank_full, aes(job)) + geom_bar(fill = "skyblue") + facet_wrap(~loan)
ggplot(bank_full, aes(education)) +
  geom_bar(fill = "skyblue") +
  facet_wrap(~loan)
## Base plot shared by the two box plots below: the balance column on the
## y-axis, grouped by job on the x-axis.
balance <- ggplot(bank_full, aes(factor(job), balance))

## Visualization of balance with outliers.
balance + geom_boxplot()

## Visualization of balance without outliers: outlier points are hidden and
## the y-axis is zoomed to the 10th-90th percentile range of balance.
## coord_cartesian() only zooms the view. Setting the limits through
## scale_y_continuous(limits = ...) would discard every observation outside
## the range before the box statistics are computed, which changes the
## medians and quartiles that get drawn.
balance +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(
    ylim = unname(quantile(bank_full$balance, c(0.1, 0.9), na.rm = TRUE))
  )
## Mine association rules containing at least 2 items, with
## support >= 0.025 and confidence >= 0.75.
## NOTE(review): apriori() has to coerce the data frame to transactions;
## how numeric or character columns are handled depends on the installed
## arules version -- confirm the columns are converted as intended.
bankrules <- apriori(
  bank_full,
  parameter = list(support = 0.025, confidence = 0.75, minlen = 2)
)
## Print the rule-set overview. This must be its own statement: placing it
## on the same line as the apriori() call is a syntax error.
bankrules
## Display the first 10 association rules from bankrules. head() is used so
## the call still works when fewer than 10 rules were found.
inspect(head(bankrules, n = 10))

## Repeat the mining with the same thresholds, now requiring at least
## 3 items per rule. This overwrites the previous rule set.
bankrules <- apriori(
  bank_full,
  parameter = list(support = 0.025, confidence = 0.75, minlen = 3)
)
bankrules
inspect(head(bankrules, n = 10))
## Sort the association rules by confidence in descending order, so the
## rules with the highest confidence appear first. String arguments use
## plain ASCII double quotes; typographic quotes are not valid R syntax.
bankrules <- sort(bankrules, by = "confidence", decreasing = TRUE)

## Sort the rules by lift. No `decreasing` argument is given, and sort() for
## arules associations defaults to decreasing = TRUE, so the highest-lift
## rules come first.
bankrules.sorted <- sort(bankrules, by = "lift")

## Matrix of subset relations between every pair of sorted rules.
subset.matrix <- is.subset(bankrules.sorted, bankrules.sorted)

## Set the lower triangle and the diagonal to NA, keeping only the upper
## triangle, so each pair of rules is compared once and no rule is compared
## with itself.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA

## Column-wise count of the remaining subset relations per rule; a rule is
## marked redundant when that count is 100 or more.
## NOTE(review): 100 is a very high threshold; the commonly published form
## of this pruning idiom uses >= 1 -- confirm that 100 is intended.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 100

## Show the positions of the redundant rules. An empty result means no rule
## reached the threshold.
which(redundant)

## Keep only the non-redundant rules.
bankrules.pruned <- bankrules.sorted[!redundant]

## Display all the rules that remain after pruning.
inspect(bankrules.pruned)
## Sort the association rules by lift in descending order and display the
## top 6 rules with the highest lift values. The sort and the inspect are
## separate statements; written on one line without a separator they do not
## parse. head() keeps the call valid when fewer than 6 rules exist.
bankrules <- sort(bankrules, by = "lift", decreasing = TRUE)
inspect(head(bankrules, n = 6))

## Summary of the rule set: total number of rules, distribution of rule
## lengths (items in antecedent + consequent), and summary statistics
## (quartiles and mean) for the quality measures.
summary(bankrules)

## Show the 5 rules with the highest confidence: sort() for arules
## associations defaults to decreasing = TRUE.
inspect(head(sort(bankrules, by = "confidence"), n = 5))

## Show the 5 association rules with the highest lift values.
inspect(head(sort(bankrules, by = "lift", decreasing = TRUE), n = 5))
## Scatter plot visualizations of the association rules (arulesViz), first
## for the full rule set and then for the pruned rule set.
## Plot elements:
##   X-axis: support (how frequently the rule appears in the data)
##   Y-axis: lift (how much stronger the association is compared to random)
##   Color/shading: confidence (how reliable each rule is)
##   Each point: one association rule
## The measure and shading names are plain ASCII-quoted strings;
## typographic quotes are not valid R syntax.
plot(bankrules, measure = c("support", "lift"), shading = "confidence")
plot(
  bankrules.pruned,
  measure = c("support", "lift"),
  shading = "confidence"
)
```