library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# --------------------------------------------------
# 1) Setup
# --------------------------------------------------
# Keep results reproducible
set.seed(786)
# Provide the path to your seeds_dataset.txt file
seeds_df <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
file_loc <- "~/Downloads/seeds_dataset.txt"
# Read the seeds data (tab-separated, no header)
# This dataset has 8 columns:
# 1) area
# 2) perimeter
# 3) compactness
# 4) length_of_kernel
# 5) width.of.kernel
# 6) asymetry.coefficient
# 7) length.of.kernel.groove
# 8) type.of.seed
#seeds_dataset <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
# --------------------------------------------------
# 2) Assign Column Names
# --------------------------------------------------
feature_name <- c(
"area",
"perimeter",
"compactness",
"length.of.kernel",
"width.of.kernel",
"asymetry.coefficient",
"length.of.kernel.groove",
"type.of.seed"
)
colnames(seeds_df) <- feature_name
# Quick checks (optional)
str(seeds_df)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length.of.kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width.of.kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymetry.coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length.of.kernel.groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type.of.seed : int 1 1 1 1 1 1 1 1 1 1 ...
summary(seeds_df)
## area perimeter compactness length.of.kernel
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.27 1st Qu.:13.45 1st Qu.:0.8569 1st Qu.:5.262
## Median :14.36 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8710 Mean :5.629
## 3rd Qu.:17.30 3rd Qu.:15.71 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## width.of.kernel asymetry.coefficient length.of.kernel.groove type.of.seed
## Min. :2.630 Min. :0.7651 Min. :4.519 Min. :1
## 1st Qu.:2.944 1st Qu.:2.5615 1st Qu.:5.045 1st Qu.:1
## Median :3.237 Median :3.5990 Median :5.223 Median :2
## Mean :3.259 Mean :3.7002 Mean :5.408 Mean :2
## 3rd Qu.:3.562 3rd Qu.:4.7687 3rd Qu.:5.877 3rd Qu.:3
## Max. :4.033 Max. :8.4560 Max. :6.550 Max. :3
any(is.na(seeds_df))
## [1] FALSE
# --------------------------------------------------
# 3) Remove Rows with Missing Values
# --------------------------------------------------
seeds_df <- na.omit(seeds_df)
# --------------------------------------------------
# 4) Separate Out the True Labels
# --------------------------------------------------
# We'll hold onto these so we can check clustering results later
seeds_label <- seeds_df$type.of.seed
# Remove the type.of.seed column so only numeric features remain
seeds_df$type.of.seed <- NULL
# --------------------------------------------------
# 5) Scale the Data
# --------------------------------------------------
# scale() transforms each numeric column to mean=0, sd=1
seeds_df_sc <- as.data.frame(scale(seeds_df))
summary(seeds_df_sc)
## area perimeter compactness length.of.kernel
## Min. :-1.4632 Min. :-1.6458 Min. :-2.6619 Min. :-1.6466
## 1st Qu.:-0.8858 1st Qu.:-0.8494 1st Qu.:-0.5967 1st Qu.:-0.8267
## Median :-0.1693 Median :-0.1832 Median : 0.1037 Median :-0.2371
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8446 3rd Qu.: 0.8850 3rd Qu.: 0.7100 3rd Qu.: 0.7927
## Max. : 2.1763 Max. : 2.0603 Max. : 2.0018 Max. : 2.3619
## width.of.kernel asymetry.coefficient length.of.kernel.groove
## Min. :-1.6642 Min. :-1.95210 Min. :-1.8090
## 1st Qu.:-0.8329 1st Qu.:-0.75734 1st Qu.:-0.7387
## Median :-0.0572 Median :-0.06731 Median :-0.3766
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.8026 3rd Qu.: 0.71068 3rd Qu.: 0.9541
## Max. : 2.0502 Max. : 3.16303 Max. : 2.3234
# --------------------------------------------------
# 6) Build the Distance Matrix & Hierarchical Clustering
# --------------------------------------------------
# Use Euclidean distance for numeric data
dist_mat <- dist(seeds_df_sc, method = "euclidean")
# Perform hierarchical clustering with "average" linkage
hclust_avg <- hclust(dist_mat, method = "average")
# --------------------------------------------------
# 7) Visualize the Dendrogram
# --------------------------------------------------
plot(hclust_avg) # Basic dendrogram
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")
# --------------------------------------------------
# 8) Cut the Tree into 3 Clusters
# --------------------------------------------------
cut_avg <- cutree(hclust_avg, k = 3)
# --------------------------------------------------
# 9) Color the Dendrogram Branches
# --------------------------------------------------
# Install and load dendextend (if not already installed)
if (!require("dendextend")) install.packages("dendextend", dependencies = TRUE)
## Loading required package: dendextend
## Warning: package 'dendextend' was built under R version 4.3.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(dendextend)
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
# --------------------------------------------------
# 10) Evaluate Clusters with dplyr
# --------------------------------------------------
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
library(dplyr)
# Put the clusters back into the original numeric dataframe
seeds_df_cl <- mutate(seeds_df, cluster = cut_avg)
# How many observations per cluster?
count(seeds_df_cl, cluster)
## cluster n
## 1 1 65
## 2 2 75
## 3 3 70
# --------------------------------------------------
# 11) Compare Clusters to True Labels
# --------------------------------------------------
table(seeds_df_cl$cluster, seeds_label)
## seeds_label
## 1 2 3
## 1 55 2 8
## 2 7 68 0
## 3 8 0 62
# --------------------------------------------------
# 12) Scatter Plot of area vs. perimeter (Colored by Cluster)
# --------------------------------------------------
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(seeds_df_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
geom_point()
cm <- table(seeds_df_cl$cluster, seeds_label)
cm
## seeds_label
## 1 2 3
## 1 55 2 8
## 2 7 68 0
## 3 8 0 62
error <- 1-sum(diag(cm))/sum(cm)
error
## [1] 0.1190476
library(cluster)
s <- silhouette(seeds_df_cl$cluster, dist(seeds_df_sc))
plot(s)
d <- dist(scale(seeds_df[,-8]))
methds <- c('complete','single','average')
avgS <- matrix(NA, ncol=3,nrow=5, dimnames = list(2:6,methds))
avgS
## complete single average
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
for (k in 2:6)
for(m in seq_along(methds)){
h <- hclust(d,meth=methds[m])
c <- cutree(h,k)
s <- silhouette(c,d)
avgS[k-1,m]<-mean(s[,3])
}
avgS
## complete single average
## 2 0.4519948 0.056230166 0.4413386
## 3 0.3501985 -0.005642379 0.3759568
## 4 0.3148568 -0.082752642 0.3548642
## 5 0.2937202 -0.094595852 0.2752309
## 6 0.2173803 -0.229386235 0.2903059
set.seed(1234)
seedKM <- kmeans(seeds_df[,-8], center=3,iter.max = 200)
seedKM
## K-means clustering with 3 clusters of sizes 72, 61, 77
##
## Cluster means:
## area perimeter compactness length.of.kernel width.of.kernel
## 1 14.64847 14.46042 0.8791667 5.563778 3.277903
## 2 18.72180 16.29738 0.8850869 6.208934 3.722672
## 3 11.96442 13.27481 0.8522000 5.229286 2.872922
## asymetry.coefficient length.of.kernel.groove
## 1 2.648933 5.192319
## 2 3.603590 6.066098
## 3 4.759740 5.088519
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 2 1 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 3 3 3 1 1 1 1 1 3 2 2 2 2 2 2 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 1 2 1 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 201 202 203 204 205 206 207 208 209 210
## 3 1 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 207.4648 184.1086 195.7453
## (between_SS / total_SS = 78.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(seedKM$cluster, seeds_label)
## seeds_label
## 1 2 3
## 1 60 10 2
## 2 1 60 0
## 3 9 0 68
cm <- table(seedKM$cluster, seeds_label)
errorKM <-1-sum(diag(cm))/sum(cm)
errorKM
## [1] 0.1047619
cm
## seeds_label
## 1 2 3
## 1 60 10 2
## 2 1 60 0
## 3 9 0 68
skm <- silhouette(seedKM$cluster,dist(seeds_df_sc))
plot(skm)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.