Data preparation
library(cluster)
library(ggplot2)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.4.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(NbClust)
library(kohonen)
## Loading required package: class
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(datasets)
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
##
## ---------------------
## Welcome to heatmaply version 0.7.0
## Type ?heatmaply for the main documentation.
## The github page is: https://github.com/talgalili/heatmaply/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(heatmaply))
## ---------------------
ProductData <- read.csv("products2.csv", na.strings = "undefined",
colClasses = c("character", "character", rep("numeric", 48)))
ProductData$prodid <-NULL
ProdDataSubset <- ProductData[rowSums(is.na(ProductData))<25,unname(colSums(is.na(ProductData)))<1000]
CleanProductData<- ProdDataSubset %>% distinct(prodname, .keep_all = TRUE)
CleanProductData[is.na(CleanProductData)] <- 0
rownames(CleanProductData) <- CleanProductData[,1]
CleanProductData$prodname <- NULL
ScaledProdData <- scale(CleanProductData)
ScaledProdData[is.na(ScaledProdData)] <- 0
stdist <- dist(ScaledProdData, method = "euclidean")
stclusters <- agnes(ScaledProdData, method = "complete", metric = "euclidean")
plot(stclusters, which.plots=2, cex = 0.8)
#Identify the best number of clusters for k-means and perform k-means clustering
#index = kl
bestK1 <- NbClust(ScaledProdData, min.nc=5, max.nc=15, method="kmeans", index = "kl")
bestK1
## $All.index
## 5 6 7 8 9 10 11 12 13 14
## 0.3165 2.1535 0.9688 1.1218 1.1549 0.8349 2.2557 0.6103 1.4139 0.8082
## 15
## 1.3476
##
## $Best.nc
## Number_clusters Value_Index
## 11.0000 2.2557
##
## $Best.partition
## Lucky Charms
## 9
## Rice krispies
## 9
## Froot Loops
## 9
## Cap'n Crunch
## 4
## Special K Protein
## 4
## Sweetened Soy Drink
## 11
## Peanut Butter Crunch
## 4
## Oats & More with Strawberries Cereal
## 5
## Joe's O's
## 4
## Honey Bunches of Oats
## 5
## Cracklin' Oat Bran
## 11
## Large Grade AA 5 Dozen Eggs
## 1
## Spongebob Squarepants Fruity Splash
## 9
## Wavy original
## 6
## Cheerios
## 9
## Ovaltine Rich Chocolate Mix
## 9
## Special K original
## 4
## Very vanilla Soymilk
## 2
## Raisin Nut Bran
## 10
## Crispy Flakes with Red Berries Cereal
## 5
## Mini Trix
## 9
## Special K Red Berries
## 4
## Honey Nut Cheerios
## 9
## Frosted Shredded Wheats
## 3
## Frosted Flakes
## 9
## Grape-Nuts Cereal
## 10
## Count Chocula (family size)
## 9
## Tootie Fruities
## 9
## Apricot
## 8
## Egg Land's best
## 1
## Chocolate Chex
## 4
##
## 9
## Sliced Almonds
## 6
## Kidz Zone Perfect Peanut Butter Chocolate Chip
## 11
## Lightly Salted Cachews
## 6
## life Original
## 4
## Rice Chex
## 4
## Pure Protein Bar
## 7
## Chocolate Chip
## 8
## Crunchy Peanut Butter
## 8
## Chocolate Chip Peanut Crunch
## 8
## White Chocolate Macadamia Nut
## 8
## Coco Puffs
## 9
stclusterk1 <- kmeans(CleanProductData, bestK1$Best.nc[1], nstart = 25)
stclusterk1
## K-means clustering with 11 clusters of sizes 2, 3, 9, 2, 3, 6, 2, 4, 8, 2, 2
##
## Cluster means:
## calcium_serving carbohydrates_serving cholesterol_serving
## 1 0.02000000 0.50000 0.2075
## 2 0.25000000 41.00000 0.0000
## 3 0.06666667 22.44444 0.0000
## 4 0.25000000 44.00000 0.0000
## 5 0.07000000 14.00000 0.0030
## 6 0.01666667 24.66667 0.0000
## 7 0.04000000 7.50000 0.0000
## 8 0.01500000 40.75000 0.0000
## 9 0.09625000 24.37500 0.0000
## 10 0.10000000 15.50000 0.0050
## 11 0.09000000 16.50000 0.0025
## energy_serving energy.from.fat_serving fat_serving fiber_serving
## 1 314.0000 167.00000 4.5000000 0.000000
## 2 1060.0000 251.00000 6.6666667 4.000000
## 3 441.3333 60.66667 1.5000000 1.777778
## 4 983.0000 157.00000 4.2500000 4.500000
## 5 172.6667 7.00000 1.0000000 2.000000
## 6 453.0000 3.50000 0.1666667 1.333333
## 7 732.0000 544.00000 14.0000000 2.000000
## 8 826.5000 21.00000 3.0000000 6.000000
## 9 523.0000 55.12500 1.8125000 1.625000
## 10 795.0000 314.00000 8.0000000 1.000000
## 11 565.0000 188.00000 5.2500000 1.500000
## iron_serving proteins_serving salt_serving saturated.fat_serving
## 1 0.0007200 7.000000 0.1778000 1.2500000
## 2 0.0018000 10.000000 0.5757333 1.5000000
## 3 0.0046000 1.555556 0.4007556 0.2777778
## 4 0.0031500 9.500000 0.3492500 0.7500000
## 5 0.0035400 2.000000 0.2624667 0.1666667
## 6 0.0073500 2.500000 0.4529667 0.0000000
## 7 0.0014400 6.000000 0.0571500 1.7500000
## 8 0.0096750 4.750000 0.3968750 0.8750000
## 9 0.0069975 3.500000 0.3921125 0.1250000
## 10 0.0009000 11.000000 0.4318000 2.2500000
## 11 0.0018900 6.000000 0.1524000 1.7500000
## sodium_serving sugars_serving trans.fat_serving vitamin.a_serving
## 1 0.0700000 0.000000 0 0.0000900000
## 2 0.2266667 21.000000 0 0.0004500000
## 3 0.1577778 8.888889 0 0.0001166667
## 4 0.1375000 23.500000 0 0.0003000000
## 5 0.1033333 4.000000 0 0.0001000000
## 6 0.1783333 6.166667 0 0.0002375000
## 7 0.0225000 1.500000 0 0.0000000000
## 8 0.1562500 11.000000 0 0.0000300000
## 9 0.1543750 8.750000 0 0.0001875000
## 10 0.1700000 1.500000 0 0.0001875000
## 11 0.0600000 10.500000 0 0.0001200000
## vitamin.c_serving
## 1 0.000000000
## 2 0.037800000
## 3 0.005666667
## 4 0.042000000
## 5 0.004000000
## 6 0.014000000
## 7 0.000000000
## 8 0.003750000
## 9 0.006000000
## 10 0.010500000
## 11 0.003000000
##
## Clustering vector:
## Lucky Charms
## 3
## Rice krispies
## 6
## Froot Loops
## 3
## Cap'n Crunch
## 3
## Special K Protein
## 9
## Sweetened Soy Drink
## 11
## Peanut Butter Crunch
## 3
## Oats & More with Strawberries Cereal
## 9
## Joe's O's
## 3
## Honey Bunches of Oats
## 9
## Cracklin' Oat Bran
## 8
## Large Grade AA 5 Dozen Eggs
## 1
## Spongebob Squarepants Fruity Splash
## 3
## Wavy original
## 10
## Cheerios
## 5
## Ovaltine Rich Chocolate Mix
## 5
## Special K original
## 6
## Very vanilla Soymilk
## 9
## Raisin Nut Bran
## 8
## Crispy Flakes with Red Berries Cereal
## 6
## Mini Trix
## 9
## Special K Red Berries
## 6
## Honey Nut Cheerios
## 3
## Frosted Shredded Wheats
## 8
## Frosted Flakes
## 6
## Grape-Nuts Cereal
## 8
## Count Chocula (family size)
## 3
## Tootie Fruities
## 9
## Apricot
## 4
## Egg Land's best
## 1
## Chocolate Chex
## 9
##
## 5
## Sliced Almonds
## 7
## Kidz Zone Perfect Peanut Butter Chocolate Chip
## 11
## Lightly Salted Cachews
## 7
## life Original
## 9
## Rice Chex
## 6
## Pure Protein Bar
## 10
## Chocolate Chip
## 4
## Crunchy Peanut Butter
## 2
## Chocolate Chip Peanut Crunch
## 2
## White Chocolate Macadamia Nut
## 2
## Coco Puffs
## 3
##
## Within cluster sum of squares by cluster:
## [1] 882.6255 1179.1762 7078.5369 2807.7526 11850.8572 5478.8918
## [7] 11478.1357 10319.0001 10882.6743 11638.1585 8853.7502
## (between_SS / total_SS = 97.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
name<-c()
kl<-list()
for (i in 1:bestK1$Best.nc[1]){
name<-append(name,paste("group",i,sep=""))
"text"->kl[[i]]
kl[[i]]<-kl[[i]][-1]
}
names(kl)<-name
for (i in 1:length(stclusterk1$cluster)){
for (j in 1:bestK1$Best.nc[1]){
if (stclusterk1$cluster[i]==j) {
kl[[j]]<-append(kl[[j]],names(stclusterk1$cluster[i]))
}}}
kl
## $group1
## [1] " Large Grade AA 5 Dozen Eggs" " Egg Land's best"
##
## $group2
## [1] " Crunchy Peanut Butter" " Chocolate Chip Peanut Crunch"
## [3] " White Chocolate Macadamia Nut"
##
## $group3
## [1] " Lucky Charms"
## [2] " Froot Loops"
## [3] " Cap'n Crunch"
## [4] " Peanut Butter Crunch"
## [5] " Joe's O's"
## [6] " Spongebob Squarepants Fruity Splash"
## [7] " Honey Nut Cheerios"
## [8] " Count Chocula (family size)"
## [9] " Coco Puffs"
##
## $group4
## [1] " Apricot" " Chocolate Chip"
##
## $group5
## [1] " Cheerios" " Ovaltine Rich Chocolate Mix"
## [3] " "
##
## $group6
## [1] " Rice krispies"
## [2] " Special K original"
## [3] " Crispy Flakes with Red Berries Cereal"
## [4] " Special K Red Berries"
## [5] " Frosted Flakes"
## [6] " Rice Chex"
##
## $group7
## [1] " Sliced Almonds" " Lightly Salted Cachews"
##
## $group8
## [1] " Cracklin' Oat Bran" " Raisin Nut Bran"
## [3] " Frosted Shredded Wheats" " Grape-Nuts Cereal"
##
## $group9
## [1] " Special K Protein"
## [2] " Oats & More with Strawberries Cereal"
## [3] " Honey Bunches of Oats"
## [4] " Very vanilla Soymilk"
## [5] " Mini Trix"
## [6] " Tootie Fruities"
## [7] " Chocolate Chex"
## [8] " life Original"
##
## $group10
## [1] " Wavy original" " Pure Protein Bar"
##
## $group11
## [1] " Sweetened Soy Drink"
## [2] " Kidz Zone Perfect Peanut Butter Chocolate Chip"
#Build a Kohonen SOM.
statesom <- som(data=ScaledProdData, grid = somgrid(5, 5, "hexagonal"),radius = quantile(ScaledProdData, 1) * 5)
plot(statesom, type="mapping", labels = rownames(ScaledProdData),cex = 1)
plot(statesom, type="codes")
heatmaply(ScaledProdData, margins = c(40, 130))