European Employment Data
euro.data = read.table('data/europeanJobs.txt', header=T)
Warning message:
package ‘arules’ was built under R version 3.6.3
#summary(euro.data)
set.seed(13603433)
index <- sample(nrow(euro.data),nrow(euro.data)*0.80)
euro.data.train <- euro.data[index,2:10]
euro.data.text <- euro.data[-index,]
euro.data.train.dist=dist(euro.data.train)
euro.data.train.hclust=hclust(euro.data.train.dist, method="ward")
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
plot(euro.data.train.hclust)

print('Output for K = 3')
[1] "Output for K = 3"
#Cut dendrogram at the 3 clusters level and obtain cluster membership
euro.data.train.3clust = cutree(euro.data.train.hclust,k=3)
#See exactly which item are in third group
euro.data.train[euro.data.train.3clust==3,]
#get cluster means for raw data
#Centroid Plot against 1st 2 discriminant functions
#Load the fpc library needed for plotcluster function
library(fpc)
package 㤼㸱fpc㤼㸲 was built under R version 3.6.3
#plotcluster(ZooFood, fit$cluster)
plotcluster(euro.data.train, euro.data.train.3clust)

print('Output for K = 4')
[1] "Output for K = 4"
euro.data.train.3clust = cutree(euro.data.train.hclust,k=4)
euro.data.train[euro.data.train.3clust==4,]
plotcluster(euro.data.train, euro.data.train.3clust)

print('Output for K = 5')
[1] "Output for K = 5"
euro.data.train.3clust = cutree(euro.data.train.hclust,k=5)
euro.data.train[euro.data.train.3clust==5,]
plotcluster(euro.data.train, euro.data.train.3clust)

library(factoextra)
package 㤼㸱factoextra㤼㸲 was built under R version 3.6.3Loading required package: ggplot2
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(euro.data.train)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

seed <- euro.data.train
# K-Means Cluster Analysis
fit <- kmeans(seed, 2) #2 cluster solution
#Display number of clusters in each cluster
table(fit$cluster)
1 2
7 13
fit
K-means clustering with 2 clusters of sizes 7, 13
Cluster means:
Agr Min Man PS Con SI Fin SPS TC
1 27.9 1.400000 26.18571 0.900000 8.614286 10.65714 2.757143 15.34286 6.242857
2 8.6 1.215385 30.26154 1.007692 8.423077 15.82308 4.492308 23.19231 6.946154
Clustering vector:
19 1 11 20 24 10 16 14 5 6 15 8 21 13 17 4 7 12 22 9
1 2 2 2 1 2 2 1 1 2 1 2 2 2 2 2 2 1 1 2
Within cluster sum of squares by cluster:
[1] 741.3857 1181.2138
(between_SS / total_SS = 53.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter" "ifault"
fviz_cluster(fit, data = seed)

fviz_cluster(fit, data = seed)

k3 <- kmeans(seed, centers = 3, nstart = 25)
k4 <- kmeans(seed, centers = 4, nstart = 25)
k5 <- kmeans(seed, centers = 5, nstart = 25)
# plots to compare
p1 <- fviz_cluster(fit, geom = "point", data = seed) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = seed) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = seed) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = seed) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

library(fpc)
plotcluster(seed, fit$cluster)

#See exactly which item are in 1st group
seed[fit$cluster==1,]
#get cluster means for scaled data
aggregate(seed,by=list(fit$cluster),FUN=mean)
# Determine number of clusters
wss <- (nrow(seed)-1)*sum(apply(seed,2,var))
for (i in 2:12) wss[i] <- sum(kmeans(seed,
centers=i)$withinss)
plot(1:12, wss, type="b", xlab="Number of Clusters",ylab="Within groups sum of squares")

Cincinnati Zoo Data
library(arules)
TransFood <- read.csv('https://xiaoruizhu.github.io/Data-Mining-R/data/food_4_association.csv')
TransFood <- TransFood[, -1]
# Find out elements that are not equal to 0 or 1 and change them to 1.
Others <- which(!(as.matrix(TransFood) ==1 | as.matrix(TransFood) ==0), arr.ind=T )
TransFood[Others] <- 1
TransFood <- as(as.matrix(TransFood), "transactions")
#Load the data for clustering:
Food_by_month <- read.csv('https://xiaoruizhu.github.io/Data-Mining-R/data/qry_Food_by_Month.csv')
summary(Food_by_month)
NickName Oct..10 Nov..10 Dec..10 Jan..11 Feb..11 Mar..11
Alchohol : 1 Min. : 0 Min. : 2.0 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0
Bottled Water : 1 1st Qu.: 39 1st Qu.: 26.0 1st Qu.: 5.5 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 15.0
Burger : 1 Median : 154 Median : 66.0 Median : 56.0 Median : 8.0 Median : 11.00 Median : 48.0
Capri Sun : 1 Mean : 371 Mean :146.4 Mean : 188.2 Mean : 29.0 Mean : 54.55 Mean :150.6
Cheese : 1 3rd Qu.: 524 3rd Qu.:265.5 3rd Qu.: 275.0 3rd Qu.: 50.5 3rd Qu.: 93.50 3rd Qu.:232.5
Cheese Fries Basket: 1 Max. :2002 Max. :597.0 Max. :1089.0 Max. :186.0 Max. :279.00 Max. :785.0
(Other) :49
Groceries <- TransFood
summary(Groceries)
transactions as itemMatrix in sparse format with
19076 rows (elements/itemsets/transactions) and
118 columns (items) and a density of 0.02230729
most frequent items:
Bottled.WaterFood Slice.of.CheeseFood Medium.DrinkFood Small.DrinkFood Slice.of.PeppFood (Other)
3166 3072 2871 2769 2354 35981
element (itemset/transaction) length distribution:
sizes
0 1 2 3 4 5 6 7 8 9 10 11 12 13 15
197 5675 5178 3253 2129 1293 655 351 178 95 42 14 8 7 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.000 2.000 2.632 4.000 15.000
includes extended item information - examples:
x = Groceries[size(Groceries) > 3]
#inspect(x)
itemFrequencyPlot(Groceries, support = 0.1, cex.names=0.8)

basket_rules <- apriori(Groceries,parameter = list(sup = 0.003, conf = 0.5,target="rules"))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 57
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[115 item(s), 19076 transaction(s)] done [0.01s].
sorting and recoding items ... [74 item(s)] done [0.00s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 done [0.03s].
writing ... [42 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
summary(basket_rules)
set of 42 rules
rule length distribution (lhs + rhs):sizes
2 3 4
12 27 3
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 3.000 2.786 3.000 4.000
summary of quality measures:
support confidence lift count
Min. :0.003093 Min. :0.5032 Min. : 3.125 Min. : 59.00
1st Qu.:0.003683 1st Qu.:0.5944 1st Qu.: 6.307 1st Qu.: 70.25
Median :0.004561 Median :0.7586 Median : 8.235 Median : 87.00
Mean :0.006836 Mean :0.7404 Mean : 8.929 Mean :130.40
3rd Qu.:0.007496 3rd Qu.:0.8721 3rd Qu.: 9.016 3rd Qu.:143.00
Max. :0.028570 Max. :1.0000 Max. :26.179 Max. :545.00
mining info:
inspect(head(basket_rules))
inspect(subset(basket_rules, size(basket_rules)>4))
inspect(subset(basket_rules, lift>5))
yogurt.rhs <- subset(basket_rules, subset = rhs %in% "Medium.DrinkFood" & lift>3.5)
inspect(yogurt.rhs)
meat.lhs <- subset(basket_rules, subset = lhs %in% "Bottled.WaterFood" & lift>1.5)
inspect(meat.lhs)
#install.packages('arulesViz')
library('arulesViz')
package 㤼㸱arulesViz㤼㸲 was built under R version 3.6.3Loading required package: grid
Registered S3 method overwritten by 'seriation':
method from
reorder.hclust gclus
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'data.table':
method from
print.data.table
plot(basket_rules)

plot(basket_rules, interactive=FALSE)
The parameter interactive is deprecated. Use engine='interactive' instead.

plot(head(sort(basket_rules, by="lift"), 10), method = "graph")

plot(basket_rules, method="grouped")
