R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load necessary libraries


library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
# Set seed for reproducibility
set.seed(786)

# Load dataset
seeds_dataset <- read.delim("C:/Users/otuata4438/Downloads/seeds_dataset.txt", header=FALSE)
sd <- seeds_dataset  # Assign dataset to sd

# Corrected feature names
feature_names <- c('area', 'perimeter', 'compactness', 'length_of_kernel', 
                   'width_of_kernel', 'asymmetry_coefficient', 
                   'length_of_kernel_groove', 'type_of_seed')

colnames(sd) <- feature_names
str(sd)
## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width_of_kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymmetry_coefficient  : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length_of_kernel_groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type_of_seed           : int  1 1 1 1 1 1 1 1 1 1 ...
# Check for missing values
summary(sd)
##       area         perimeter      compactness     length_of_kernel
##  Min.   :10.59   Min.   :12.41   Min.   :0.8081   Min.   :4.899   
##  1st Qu.:12.27   1st Qu.:13.45   1st Qu.:0.8569   1st Qu.:5.262   
##  Median :14.36   Median :14.32   Median :0.8734   Median :5.524   
##  Mean   :14.85   Mean   :14.56   Mean   :0.8710   Mean   :5.629   
##  3rd Qu.:17.30   3rd Qu.:15.71   3rd Qu.:0.8878   3rd Qu.:5.980   
##  Max.   :21.18   Max.   :17.25   Max.   :0.9183   Max.   :6.675   
##  width_of_kernel asymmetry_coefficient length_of_kernel_groove  type_of_seed
##  Min.   :2.630   Min.   :0.7651        Min.   :4.519           Min.   :1    
##  1st Qu.:2.944   1st Qu.:2.5615        1st Qu.:5.045           1st Qu.:1    
##  Median :3.237   Median :3.5990        Median :5.223           Median :2    
##  Mean   :3.259   Mean   :3.7002        Mean   :5.408           Mean   :2    
##  3rd Qu.:3.562   3rd Qu.:4.7687        3rd Qu.:5.877           3rd Qu.:3    
##  Max.   :4.033   Max.   :8.4560        Max.   :6.550           Max.   :3
any(is.na(sd))
## [1] FALSE
# Remove missing values if any
sd <- na.omit(sd)
str(sd)
## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width_of_kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymmetry_coefficient  : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length_of_kernel_groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type_of_seed           : int  1 1 1 1 1 1 1 1 1 1 ...
# Extract seed labels
seeds_label <- sd$type_of_seed
sd$type_of_seed <- NULL  # Remove label column from feature set

# Scale the data
sd_sc <- as.data.frame(scale(sd))
summary(sd_sc)
##       area           perimeter        compactness      length_of_kernel 
##  Min.   :-1.4632   Min.   :-1.6458   Min.   :-2.6619   Min.   :-1.6466  
##  1st Qu.:-0.8858   1st Qu.:-0.8494   1st Qu.:-0.5967   1st Qu.:-0.8267  
##  Median :-0.1693   Median :-0.1832   Median : 0.1037   Median :-0.2371  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8446   3rd Qu.: 0.8850   3rd Qu.: 0.7100   3rd Qu.: 0.7927  
##  Max.   : 2.1763   Max.   : 2.0603   Max.   : 2.0018   Max.   : 2.3619  
##  width_of_kernel   asymmetry_coefficient length_of_kernel_groove
##  Min.   :-1.6642   Min.   :-1.95210      Min.   :-1.8090        
##  1st Qu.:-0.8329   1st Qu.:-0.75734      1st Qu.:-0.7387        
##  Median :-0.0572   Median :-0.06731      Median :-0.3766        
##  Mean   : 0.0000   Mean   : 0.00000      Mean   : 0.0000        
##  3rd Qu.: 0.8026   3rd Qu.: 0.71068      3rd Qu.: 0.9541        
##  Max.   : 2.0502   Max.   : 3.16303      Max.   : 2.3234
# Compute distance matrix (Fixed "euclidian" typo)
dist_mat <- dist(sd_sc, method= "euclidean")

# Perform hierarchical clustering
hclust_avg <- hclust(dist_mat, method= 'average')

# Plot dendrogram
plot(hclust_avg)

# Cut tree into 3 clusters
cut_avg <- cutree(hclust_avg, k = 3)

# Plot hierarchical clustering results with colored clusters
plot(hclust_avg)
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")

# Color dendrogram branches
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)

# Assign clusters to dataset
sd_cl <- mutate(sd, cluster = cut_avg)

# Count number of observations per cluster
count(sd_cl, cluster)
##   cluster  n
## 1       1 65
## 2       2 75
## 3       3 70
# Plot clusters using ggplot
ggplot(sd_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
  geom_point()

# Create confusion matrix (Fixed reference to correct variable)
cm <- table(sd_cl$cluster, seeds_label)
cm
##    seeds_label
##      1  2  3
##   1 55  2  8
##   2  7 68  0
##   3  8  0 62
# Compute error rate (Fixed `sum(sm)`, changed to `sum(cm)`)
error <- 1 - sum(diag(cm)) / sum(cm)
error
## [1] 0.1190476
# Compute silhouette scores
s <- silhouette(sd_cl$cluster, dist(sd_sc))
plot(s)

# Compare different clustering methods
d <- dist(scale(sd[, -8]))
methods <- c("complete", "single", "average")  # Fixed "averge" typo

# Create an empty matrix to store silhouette averages
avgs <- matrix(NA, ncol = 3, nrow = 5, dimnames = list(2:6, methods))
avgs
##   complete single average
## 2       NA     NA      NA
## 3       NA     NA      NA
## 4       NA     NA      NA
## 5       NA     NA      NA
## 6       NA     NA      NA
# Loop through different cluster sizes and linkage methods
for (k in 2:6) {
  for (m in seq_along(methods)) {  # Fixed typo "methtods" → "methods"
    h <- hclust(d, method = methods[m])  # Specify 'method' argument
    c <- cutree(h, k)
    s <- silhouette(c, d)
    avgs[k - 1, m] <- mean(s[, 3])
  }
}

avgs  # Display silhouette averages
##    complete       single   average
## 2 0.4519948  0.056230166 0.4413386
## 3 0.3501985 -0.005642379 0.3759568
## 4 0.3148568 -0.082752642 0.3548642
## 5 0.2937202 -0.094595852 0.2752309
## 6 0.2173803 -0.229386235 0.2903059
# Perform K-Means clustering
set.seed(1234)
seedKM <- kmeans(sd[, -8], centers = 3, iter.max = 200)
seedKM  # Display results
## K-means clustering with 3 clusters of sizes 72, 61, 77
## 
## Cluster means:
##       area perimeter compactness length_of_kernel width_of_kernel
## 1 14.64847  14.46042   0.8791667         5.563778        3.277903
## 2 18.72180  16.29738   0.8850869         6.208934        3.722672
## 3 11.96442  13.27481   0.8522000         5.229286        2.872922
##   asymmetry_coefficient length_of_kernel_groove
## 1              2.648933                5.192319
## 2              3.603590                6.066098
## 3              4.759740                5.088519
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   3   1   1   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   3   1   1   1   1   1   1   1   1   1   1   2   1   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   3   3   3   1   1   1   1   1   3   2   2   2   2   2   2   2   2   2   2 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   1   2   1   2   2   2   2   2   2   2   1   1   1   1   2   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 201 202 203 204 205 206 207 208 209 210 
##   3   1   3   3   3   3   3   3   3   3 
## 
## Within cluster sum of squares by cluster:
## [1] 207.4648 184.1086 195.7453
##  (between_SS / total_SS =  78.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Create confusion matrix for K-Means (Fixed "seedslable" typo)
cm <- table(seedKM$cluster, seeds_label)
cm
##    seeds_label
##      1  2  3
##   1 60 10  2
##   2  1 60  0
##   3  9  0 68
# Compute error rate for K-Means
errorKM <- 1 - sum(diag(cm)) / sum(cm)
errorKM
## [1] 0.1047619

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.