This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load necessary libraries
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
# Set seed for reproducibility
set.seed(786)
# Load dataset
seeds_dataset <- read.delim("C:/Users/otuata4438/Downloads/seeds_dataset.txt", header=FALSE)
sd <- seeds_dataset # Assign dataset to sd
# Corrected feature names
feature_names <- c('area', 'perimeter', 'compactness', 'length_of_kernel',
'width_of_kernel', 'asymmetry_coefficient',
'length_of_kernel_groove', 'type_of_seed')
colnames(sd) <- feature_names
str(sd)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width_of_kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry_coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length_of_kernel_groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type_of_seed : int 1 1 1 1 1 1 1 1 1 1 ...
# Check for missing values
summary(sd)
## area perimeter compactness length_of_kernel
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.27 1st Qu.:13.45 1st Qu.:0.8569 1st Qu.:5.262
## Median :14.36 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8710 Mean :5.629
## 3rd Qu.:17.30 3rd Qu.:15.71 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## width_of_kernel asymmetry_coefficient length_of_kernel_groove type_of_seed
## Min. :2.630 Min. :0.7651 Min. :4.519 Min. :1
## 1st Qu.:2.944 1st Qu.:2.5615 1st Qu.:5.045 1st Qu.:1
## Median :3.237 Median :3.5990 Median :5.223 Median :2
## Mean :3.259 Mean :3.7002 Mean :5.408 Mean :2
## 3rd Qu.:3.562 3rd Qu.:4.7687 3rd Qu.:5.877 3rd Qu.:3
## Max. :4.033 Max. :8.4560 Max. :6.550 Max. :3
any(is.na(sd))
## [1] FALSE
# Remove missing values if any
sd <- na.omit(sd)
str(sd)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width_of_kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry_coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length_of_kernel_groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type_of_seed : int 1 1 1 1 1 1 1 1 1 1 ...
# Extract seed labels
seeds_label <- sd$type_of_seed
sd$type_of_seed <- NULL # Remove label column from feature set
# Scale the data
sd_sc <- as.data.frame(scale(sd))
summary(sd_sc)
## area perimeter compactness length_of_kernel
## Min. :-1.4632 Min. :-1.6458 Min. :-2.6619 Min. :-1.6466
## 1st Qu.:-0.8858 1st Qu.:-0.8494 1st Qu.:-0.5967 1st Qu.:-0.8267
## Median :-0.1693 Median :-0.1832 Median : 0.1037 Median :-0.2371
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8446 3rd Qu.: 0.8850 3rd Qu.: 0.7100 3rd Qu.: 0.7927
## Max. : 2.1763 Max. : 2.0603 Max. : 2.0018 Max. : 2.3619
## width_of_kernel asymmetry_coefficient length_of_kernel_groove
## Min. :-1.6642 Min. :-1.95210 Min. :-1.8090
## 1st Qu.:-0.8329 1st Qu.:-0.75734 1st Qu.:-0.7387
## Median :-0.0572 Median :-0.06731 Median :-0.3766
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.8026 3rd Qu.: 0.71068 3rd Qu.: 0.9541
## Max. : 2.0502 Max. : 3.16303 Max. : 2.3234
# Compute distance matrix (Fixed "euclidian" typo)
dist_mat <- dist(sd_sc, method= "euclidean")
# Perform hierarchical clustering
hclust_avg <- hclust(dist_mat, method= 'average')
# Plot dendrogram
plot(hclust_avg)
# Cut tree into 3 clusters
cut_avg <- cutree(hclust_avg, k = 3)
# Plot hierarchical clustering results with colored clusters
plot(hclust_avg)
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")
# Color dendrogram branches
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
# Assign clusters to dataset
sd_cl <- mutate(sd, cluster = cut_avg)
# Count number of observations per cluster
count(sd_cl, cluster)
## cluster n
## 1 1 65
## 2 2 75
## 3 3 70
# Plot clusters using ggplot
ggplot(sd_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
geom_point()
# Create confusion matrix (Fixed reference to correct variable)
cm <- table(sd_cl$cluster, seeds_label)
cm
## seeds_label
## 1 2 3
## 1 55 2 8
## 2 7 68 0
## 3 8 0 62
# Compute error rate (Fixed `sum(sm)`, changed to `sum(cm)`)
error <- 1 - sum(diag(cm)) / sum(cm)
error
## [1] 0.1190476
# Compute silhouette scores
s <- silhouette(sd_cl$cluster, dist(sd_sc))
plot(s)
# Compare different clustering methods
d <- dist(scale(sd[, -8]))
methods <- c("complete", "single", "average") # Fixed "averge" typo
# Create an empty matrix to store silhouette averages
avgs <- matrix(NA, ncol = 3, nrow = 5, dimnames = list(2:6, methods))
avgs
## complete single average
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
# Loop through different cluster sizes and linkage methods
for (k in 2:6) {
for (m in seq_along(methods)) { # Fixed typo "methtods" → "methods"
h <- hclust(d, method = methods[m]) # Specify 'method' argument
c <- cutree(h, k)
s <- silhouette(c, d)
avgs[k - 1, m] <- mean(s[, 3])
}
}
avgs # Display silhouette averages
## complete single average
## 2 0.4519948 0.056230166 0.4413386
## 3 0.3501985 -0.005642379 0.3759568
## 4 0.3148568 -0.082752642 0.3548642
## 5 0.2937202 -0.094595852 0.2752309
## 6 0.2173803 -0.229386235 0.2903059
# Perform K-Means clustering
set.seed(1234)
seedKM <- kmeans(sd[, -8], centers = 3, iter.max = 200)
seedKM # Display results
## K-means clustering with 3 clusters of sizes 72, 61, 77
##
## Cluster means:
## area perimeter compactness length_of_kernel width_of_kernel
## 1 14.64847 14.46042 0.8791667 5.563778 3.277903
## 2 18.72180 16.29738 0.8850869 6.208934 3.722672
## 3 11.96442 13.27481 0.8522000 5.229286 2.872922
## asymmetry_coefficient length_of_kernel_groove
## 1 2.648933 5.192319
## 2 3.603590 6.066098
## 3 4.759740 5.088519
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 2 1 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 3 3 3 3 1 1 1 1 1 3 2 2 2 2 2 2 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 1 2 1 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 201 202 203 204 205 206 207 208 209 210
## 3 1 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 207.4648 184.1086 195.7453
## (between_SS / total_SS = 78.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Create confusion matrix for K-Means (Fixed "seedslable" typo)
cm <- table(seedKM$cluster, seeds_label)
cm
## seeds_label
## 1 2 3
## 1 60 10 2
## 2 1 60 0
## 3 9 0 68
# Compute error rate for K-Means
errorKM <- 1 - sum(diag(cm)) / sum(cm)
errorKM
## [1] 0.1047619
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.