The data give the percentage of the workforce employed in different industries in European countries during 1979. The purpose of examining these data is to gain insight into patterns of employment (if any) among European countries in the 1970s.
# install.packages("fpc")
library(fpc)
# install.packages("cluster")
library(cluster)
library(factoextra)
# Read the tab-separated employment table (first row holds the column names).
E.Jobs <- read.delim(
  "C:/Users/Swagatam/Desktop/Data Mining 2/Case 3/europeanJobs.txt",
  header = TRUE, sep = "\t", dec = "."
)

# Fix the RNG seed so the 90% training sample is reproducible; the remaining
# rows form the hold-out set.
set.seed(12941211)
id.train <- sample(nrow(E.Jobs), nrow(E.Jobs) * 0.90)
E.Jobs.train <- E.Jobs[id.train, ]
E.Jobs.test <- E.Jobs[-id.train, ]

# Keep column 1 as is and replace columns 2:10 by their standardised values
# (centred and scaled with the training-sample statistics).
E.Jobs.train <- cbind.data.frame(E.Jobs.train[, 1], scale(E.Jobs.train[, 2:10]))

# Compute pairwise distances on the standardised numeric columns only.
# Passing the whole data frame would hand the label column (column 1) to
# dist(), which coerces it to NA and distorts every distance.
distance <- get_dist(E.Jobs.train[, 2:10])
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# k-means on the standardised columns 2:10 with three centres, followed by a
# discriminant-projection plot of the resulting cluster assignment.
fit.3 <- kmeans(x = E.Jobs.train[, 2:10], centers = 3)
plotcluster(x = E.Jobs.train[, 2:10], clvecd = fit.3$cluster)

# Same again with four centres.
fit.4 <- kmeans(x = E.Jobs.train[, 2:10], centers = 4)
plotcluster(x = E.Jobs.train[, 2:10], clvecd = fit.4$cluster)
# Good plots: fit k-means for k = 3..6 on the standardised columns 2:10.
# The fits are run in increasing order of k, one after the other.
k3 <- kmeans(x = E.Jobs.train[, 2:10], centers = 3)
k4 <- kmeans(x = E.Jobs.train[, 2:10], centers = 4)
k5 <- kmeans(x = E.Jobs.train[, 2:10], centers = 5)
k6 <- kmeans(x = E.Jobs.train[, 2:10], centers = 6)

# One cluster plot per fit, titled with its k, for side-by-side comparison.
p3 <- fviz_cluster(object = k3, data = E.Jobs.train[, 2:10], geom = "point") +
  labs(title = "k = 3")
p4 <- fviz_cluster(object = k4, data = E.Jobs.train[, 2:10], geom = "point") +
  labs(title = "k = 4")
p5 <- fviz_cluster(object = k5, data = E.Jobs.train[, 2:10], geom = "point") +
  labs(title = "k = 5")
p6 <- fviz_cluster(object = k6, data = E.Jobs.train[, 2:10], geom = "point") +
  labs(title = "k = 6")

# Arrange the four plots on a two-row grid.
library(gridExtra)
grid.arrange(grobs = list(p3, p4, p5, p6), nrow = 2)
# Best k value (elbow method): total within-cluster sum of squares for
# k = 1..10 on the standardised columns 2:10.
# The former hand-computed starting value was overwritten by the k = 1 fit, so
# it is dropped; vapply() also avoids growing `wss` inside a loop and
# guarantees one numeric value per k.
wss <- vapply(
  seq_len(10),
  function(k) sum(kmeans(E.Jobs.train[, 2:10], centers = k)$withinss),
  numeric(1)
)
plot(seq_along(wss), wss, type = "b",
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")
# Use the values of column 1 as row names so that plots can label each point.
row.names(E.Jobs.train) <- E.Jobs.train[, 1]

# More complex: bivariate cluster plot of the three-cluster solution, with
# shaded, coloured ellipses, all points and ellipses labelled (labels = 2),
# and no connecting lines between ellipses (lines = 0).
clusplot(
  x = E.Jobs.train[, 2:10],
  clus = fit.3$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# Per-cluster mean of every standardised variable.
aggregate(x = E.Jobs.train[, 2:10], by = list(fit.3$cluster), FUN = mean)
## Group.1 Agr Min Man PS Con
## 1 1 0.3417911 0.9723572 0.42209799 0.20428271 0.06895053
## 2 2 -0.5127831 -0.4572541 0.07283881 0.02074746 0.31545762
## 3 3 2.3932130 -0.2024718 -1.98721465 -0.86022171 -2.44953022
## SI Fin SPS TC
## 1 -0.8997691 -1.0295815 -0.4831731 0.3903700
## 2 0.6794530 0.4099321 0.4666803 0.1074562
## 3 -1.6069790 0.7340110 -1.5756563 -2.1184886
# Hierarchical clustering on the standardised columns 2:10. The defaults are
# written out: Euclidean distances and complete linkage.
hc_result <- hclust(
  d = dist(E.Jobs.train[, 2:10], method = "euclidean"),
  method = "complete"
)

# Dendrogram with the values of column 1 as leaf labels.
plot(hc_result, labels = E.Jobs.train[, 1])

# Cluster membership when the tree is cut into three groups.
sub_grp <- cutree(tree = hc_result, k = 3)
# fviz_cluster(list(data = E.Jobs.train[,2:10], cluster = sub_grp))

# Cut dendrogram into 3 clusters: outline them on the plot drawn above.
rect.hclust(tree = hc_result, k = 3, border = 2:5)
# Elbow plot (within-cluster sum of squares) for hierarchical clustering.
# Only the standardised numeric columns 2:10 are passed: the label column in
# position 1 is not numeric and must not enter the distance computation.
fviz_nbclust(E.Jobs.train[, 2:10], FUN = hcut, method = "wss")
library(readxl)
library(factoextra)
library(MASS)
library(readxl)
library(arules)
library(arulesViz)
library(knitr)
# Download the transaction table and drop its first column, then work on a
# single matrix copy instead of re-converting the data frame several times.
TransFood <- read.csv("https://xiaoruizhu.github.io/Data-Mining-R/data/food_4_association.csv")
TransFood <- as.matrix(TransFood[, -1])

# Find the cells that are neither 0 nor 1 and recode them to 1.
# `Others` holds their (row, col) positions; NA cells are left untouched,
# because which() drops NA comparisons.
Others <- which(!(TransFood == 1 | TransFood == 0), arr.ind = TRUE)
TransFood[Others] <- 1

# Convert the 0/1 matrix to an arules "transactions" object and plot the
# items whose support is at least 10%.
TransFood <- as(TransFood, "transactions")
itemFrequencyPlot(TransFood, support = 0.1, cex.names = 0.8)
# Mine association rules with minimum support 0.003 and minimum confidence 0.5
# (parameter names written out in full instead of abbreviated).
basket_rules <- apriori(
  TransFood,
  parameter = list(support = 0.003, confidence = 0.5, target = "rules")
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.003 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 57
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[115 item(s), 19076 transaction(s)] done [0.01s].
## sorting and recoding items ... [74 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [42 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print the first six mined rules together with their quality measures.
inspect(head(basket_rules, n = 6))
## lhs rhs support confidence lift count
## [1] {Small.Pink.LemonadeFood} => {Chicken.Nugget.BasketFood} 0.003355001 0.5925926 16.034463 64
## [2] {Grilled.Chicken.SandwichFood} => {French.Fries.BasketFood} 0.003721954 0.6698113 6.862149 71
## [3] {FloatFood} => {Ice.Cream.ConeFood} 0.007024533 0.7089947 6.355631 134
## [4] {Side.of.CheeseFood} => {Cheese.ConeyFood} 0.004665548 0.6846154 25.912149 89
## [5] {Side.of.CheeseFood} => {Hot.DogFood} 0.006290627 0.9230769 21.605663 120
## [6] {BurgerFood} => {French.Fries.BasketFood} 0.004613126 0.6616541 6.778579 88
# Show only the rules that involve more than three items in total.
inspect(basket_rules[size(basket_rules) > 3])
## lhs rhs support confidence lift count
## [1] {Krazy.KritterFood,
## Medium.DrinkFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.003250157 0.5535714 3.437477 62
## [2] {Medium.DrinkFood,
## Slice.of.PeppFood,
## Small.DrinkFood} => {Slice.of.CheeseFood} 0.003145313 0.6000000 3.725781 60
## [3] {Medium.DrinkFood,
## Slice.of.CheeseFood,
## Small.DrinkFood} => {Slice.of.PeppFood} 0.003145313 0.5172414 4.191545 60
### Strongest relationship: rules with lift above 10 and confidence above 0.6.
inspect(basket_rules[with(quality(basket_rules), lift > 10 & confidence > 0.6)])
## lhs rhs support confidence lift count
## [1] {Side.of.CheeseFood} => {Cheese.ConeyFood} 0.004665548 0.6846154 25.91215 89
## [2] {Side.of.CheeseFood} => {Hot.DogFood} 0.006290627 0.9230769 21.60566 120
## [3] {Cheese.ConeyFood,
## Side.of.CheeseFood} => {Hot.DogFood} 0.004351017 0.9325843 21.82819 83
## [4] {Hot.DogFood,
## Side.of.CheeseFood} => {Cheese.ConeyFood} 0.004351017 0.6916667 26.17903 83
# Rules whose support exceeds 0.006.
inspect(subset(basket_rules, subset = support > 0.006))
## lhs rhs support confidence lift count
## [1] {FloatFood} => {Ice.Cream.ConeFood} 0.007024533 0.7089947 6.355631 134
## [2] {Side.of.CheeseFood} => {Hot.DogFood} 0.006290627 0.9230769 21.605663 120
## [3] {SandwichFood} => {French.Fries.BasketFood} 0.007653596 0.6822430 6.989510 146
## [4] {Hot.Chocolate.Souvenir.RefillFood} => {Hot.Chocolate.SouvenirFood} 0.014992661 0.5596869 13.180972 286
## [5] {ToppingFood} => {Ice.Cream.ConeFood} 0.028569931 0.9981685 8.947868 545
## [6] {Add.CheeseFood} => {Soft.Pretzel..3_39Food} 0.019133990 0.6965649 7.601643 365
## [7] {Chicken.TendersFood} => {French.Fries.BasketFood} 0.017299224 0.7586207 7.771992 330
## [8] {CheeseburgerFood} => {French.Fries.BasketFood} 0.016879849 0.7931034 8.125264 322
## [9] {ChipsFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.008282659 0.5808824 3.607068 158
## [10] {GatoradeFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.010117425 0.5830816 3.620724 193
## [11] {Slice.of.PeppFood,
## Souvenir.DrinkFood} => {Slice.of.CheeseFood} 0.008125393 0.5032468 3.124979 155
## [12] {Medium.DrinkFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.013629692 0.5273834 3.274858 260
## [13] {Bottled.WaterFood,
## Slice.of.PeppFood} => {Slice.of.CheeseFood} 0.010694066 0.5151515 3.198903 204
# Rules whose confidence exceeds 0.6.
inspect(subset(basket_rules, subset = confidence > 0.6))
## lhs rhs support confidence lift count
## [1] {Grilled.Chicken.SandwichFood} => {French.Fries.BasketFood} 0.003721954 0.6698113 6.862149 71
## [2] {FloatFood} => {Ice.Cream.ConeFood} 0.007024533 0.7089947 6.355631 134
## [3] {Side.of.CheeseFood} => {Cheese.ConeyFood} 0.004665548 0.6846154 25.912149 89
## [4] {Side.of.CheeseFood} => {Hot.DogFood} 0.006290627 0.9230769 21.605663 120
## [5] {BurgerFood} => {French.Fries.BasketFood} 0.004613126 0.6616541 6.778579 88
## [6] {SandwichFood} => {French.Fries.BasketFood} 0.007653596 0.6822430 6.989510 146
## [7] {ToppingFood} => {Ice.Cream.ConeFood} 0.028569931 0.9981685 8.947868 545
## [8] {Add.CheeseFood} => {Soft.Pretzel..3_39Food} 0.019133990 0.6965649 7.601643 365
## [9] {Chicken.TendersFood} => {French.Fries.BasketFood} 0.017299224 0.7586207 7.771992 330
## [10] {CheeseburgerFood} => {French.Fries.BasketFood} 0.016879849 0.7931034 8.125264 322
## [11] {Cheese.ConeyFood,
## Side.of.CheeseFood} => {Hot.DogFood} 0.004351017 0.9325843 21.828193 83
## [12] {Hot.DogFood,
## Side.of.CheeseFood} => {Cheese.ConeyFood} 0.004351017 0.6916667 26.179034 83
## [13] {Bottled.WaterFood,
## ToppingFood} => {Ice.Cream.ConeFood} 0.004036486 1.0000000 8.964286 77
## [14] {Add.CheeseFood,
## Bottled.WaterFood} => {Soft.Pretzel..3_39Food} 0.003826798 0.8021978 8.754419 73
## [15] {CheeseburgerFood,
## Chicken.TendersFood} => {French.Fries.BasketFood} 0.003931642 0.9615385 9.850863 75
## [16] {Chicken.TendersFood,
## Souvenir.DrinkFood} => {French.Fries.BasketFood} 0.003197735 0.7922078 8.116088 61
## [17] {Chicken.TendersFood,
## Krazy.KritterFood} => {French.Fries.BasketFood} 0.005661564 0.9557522 9.791584 108
## [18] {Chicken.TendersFood,
## Slice.of.PeppFood} => {French.Fries.BasketFood} 0.003669532 0.9210526 9.436090 70
## [19] {Chicken.TendersFood,
## Small.DrinkFood} => {French.Fries.BasketFood} 0.004822814 0.8214286 8.415452 92
## [20] {Chicken.TendersFood,
## Medium.DrinkFood} => {French.Fries.BasketFood} 0.004141329 0.8144330 8.343783 79
## [21] {Bottled.WaterFood,
## Chicken.TendersFood} => {French.Fries.BasketFood} 0.003459845 0.7586207 7.771992 66
## [22] {Chicken.TendersFood,
## Slice.of.CheeseFood} => {French.Fries.BasketFood} 0.005399455 0.8728814 8.942580 103
## [23] {CheeseburgerFood,
## Souvenir.DrinkFood} => {French.Fries.BasketFood} 0.003250157 0.9117647 9.340936 62
## [24] {CheeseburgerFood,
## Krazy.KritterFood} => {French.Fries.BasketFood} 0.005451877 0.8813559 9.029402 104
## [25] {CheeseburgerFood,
## Slice.of.PeppFood} => {French.Fries.BasketFood} 0.003721954 0.8658537 8.870582 71
## [26] {CheeseburgerFood,
## Small.DrinkFood} => {French.Fries.BasketFood} 0.004141329 0.8315789 8.519441 79
## [27] {CheeseburgerFood,
## Medium.DrinkFood} => {French.Fries.BasketFood} 0.005189767 0.8761062 8.975619 99
## [28] {Bottled.WaterFood,
## CheeseburgerFood} => {French.Fries.BasketFood} 0.003092892 0.7662338 7.849987 59
## [29] {CheeseburgerFood,
## Slice.of.CheeseFood} => {French.Fries.BasketFood} 0.005242189 0.8695652 8.908607 100
## [30] {Hot.DogFood,
## Krazy.KritterFood} => {French.Fries.BasketFood} 0.003669532 0.6140351 6.290727 70
### Bottled.WaterFood: rules with that item on the left-hand side and a lift
### above 5.
BottledWater.lhs <- subset(
  basket_rules,
  subset = lhs %in% "Bottled.WaterFood" & lift > 5
)
inspect(BottledWater.lhs)
## lhs rhs support confidence lift count
## [1] {Bottled.WaterFood,
## ToppingFood} => {Ice.Cream.ConeFood} 0.004036486 1.0000000 8.964286 77
## [2] {Add.CheeseFood,
## Bottled.WaterFood} => {Soft.Pretzel..3_39Food} 0.003826798 0.8021978 8.754419 73
## [3] {Bottled.WaterFood,
## Chicken.TendersFood} => {French.Fries.BasketFood} 0.003459845 0.7586207 7.771992 66
## [4] {Bottled.WaterFood,
## CheeseburgerFood} => {French.Fries.BasketFood} 0.003092892 0.7662338 7.849987 59
##### Bottled Water and ToppingFood
# `%in%` keeps a rule when its left-hand side contains ANY of the listed
# items, so it also returned rules holding only one of the two. `%ain%`
# requires ALL listed items, which is what this section is about.
BottledWater2.lhs <- subset(
  basket_rules,
  subset = lhs %ain% c("Bottled.WaterFood", "ToppingFood") & lift > 5
)
inspect(BottledWater2.lhs)
## (recorded output updated by hand for the `%ain%` filter; re-run to confirm)
## lhs rhs support confidence lift count
## [1] {Bottled.WaterFood,
## ToppingFood} => {Ice.Cream.ConeFood} 0.004036486 1 8.964286 77
#### Slice.of.CheeseFood on the left-hand side, lift above 5.
Slice.Cheese.lhs <- subset(
  basket_rules,
  subset = lhs %in% "Slice.of.CheeseFood" & lift > 5
)
inspect(Slice.Cheese.lhs)
## lhs rhs support confidence lift count
## [1] {Chicken.TendersFood,
## Slice.of.CheeseFood} => {French.Fries.BasketFood} 0.005399455 0.8728814 8.942580 103
## [2] {CheeseburgerFood,
## Slice.of.CheeseFood} => {French.Fries.BasketFood} 0.005242189 0.8695652 8.908607 100
# RHS: Slice.of.CheeseFood on the right-hand side, lift above 5.
Slice.Cheese.rhs <- subset(
  basket_rules,
  subset = rhs %in% "Slice.of.CheeseFood" & lift > 5
)
inspect(Slice.Cheese.rhs)
It can be observed that there is very high confidence (almost 100%) that a transaction containing Bottled.WaterFood and ToppingFood also contains Ice.Cream.ConeFood.
A similar analysis can be done for other products, based on confidence, lift and support.
There are 42 rules that satisfy the above criteria (minimum support 0.003 and minimum confidence 0.5). The rule size varies from 2 to a maximum of 4 items. The confidence versus support plot is shown below.
# Plots
library("arulesViz")

# Default plot of the full rule set.
plot(x = basket_rules)

# The ten rules with the highest lift, drawn as a graph and as a grouped
# matrix.
plot(x = head(sort(basket_rules, by = "lift"), n = 10), method = "graph")
plot(x = head(sort(basket_rules, by = "lift"), n = 10), method = "grouped")