CW22525

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# --------------------------------------------------
# 1) Setup
# --------------------------------------------------
# Keep results reproducible
set.seed(786)

# Provide the path to your seeds_dataset.txt file
seeds_df <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
file_loc <- "~/Downloads/seeds_dataset.txt"

# Read the seeds data (tab-separated, no header)
# This dataset has 8 columns:
#   1) area
#   2) perimeter
#   3) compactness
#   4) length_of_kernel
#   5) width.of.kernel
#   6) asymetry.coefficient
#   7) length.of.kernel.groove
#   8) type.of.seed
#seeds_dataset <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
# --------------------------------------------------
# 2) Assign Column Names
# --------------------------------------------------
feature_name <- c(
  "area",
  "perimeter",
  "compactness",
  "length.of.kernel",
  "width.of.kernel",
  "asymetry.coefficient",
  "length.of.kernel.groove",
  "type.of.seed"
)
colnames(seeds_df) <- feature_name

# Quick checks (optional)
str(seeds_df)

## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length.of.kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width.of.kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymetry.coefficient   : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length.of.kernel.groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type.of.seed           : int  1 1 1 1 1 1 1 1 1 1 ...

summary(seeds_df)

##       area         perimeter      compactness     length.of.kernel
##  Min.   :10.59   Min.   :12.41   Min.   :0.8081   Min.   :4.899   
##  1st Qu.:12.27   1st Qu.:13.45   1st Qu.:0.8569   1st Qu.:5.262   
##  Median :14.36   Median :14.32   Median :0.8734   Median :5.524   
##  Mean   :14.85   Mean   :14.56   Mean   :0.8710   Mean   :5.629   
##  3rd Qu.:17.30   3rd Qu.:15.71   3rd Qu.:0.8878   3rd Qu.:5.980   
##  Max.   :21.18   Max.   :17.25   Max.   :0.9183   Max.   :6.675   
##  width.of.kernel asymetry.coefficient length.of.kernel.groove  type.of.seed
##  Min.   :2.630   Min.   :0.7651       Min.   :4.519           Min.   :1    
##  1st Qu.:2.944   1st Qu.:2.5615       1st Qu.:5.045           1st Qu.:1    
##  Median :3.237   Median :3.5990       Median :5.223           Median :2    
##  Mean   :3.259   Mean   :3.7002       Mean   :5.408           Mean   :2    
##  3rd Qu.:3.562   3rd Qu.:4.7687       3rd Qu.:5.877           3rd Qu.:3    
##  Max.   :4.033   Max.   :8.4560       Max.   :6.550           Max.   :3

any(is.na(seeds_df))

## [1] FALSE

# --------------------------------------------------
# 3) Remove Rows with Missing Values
# --------------------------------------------------
seeds_df <- na.omit(seeds_df)

# --------------------------------------------------
# 4) Separate Out the True Labels
# --------------------------------------------------
# We'll hold onto these so we can check clustering results later
seeds_label <- seeds_df$type.of.seed

# Remove the type.of.seed column so only numeric features remain
seeds_df$type.of.seed <- NULL

# --------------------------------------------------
# 5) Scale the Data
# --------------------------------------------------
# scale() transforms each numeric column to mean=0, sd=1
seeds_df_sc <- as.data.frame(scale(seeds_df))
summary(seeds_df_sc)

##       area           perimeter        compactness      length.of.kernel 
##  Min.   :-1.4632   Min.   :-1.6458   Min.   :-2.6619   Min.   :-1.6466  
##  1st Qu.:-0.8858   1st Qu.:-0.8494   1st Qu.:-0.5967   1st Qu.:-0.8267  
##  Median :-0.1693   Median :-0.1832   Median : 0.1037   Median :-0.2371  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8446   3rd Qu.: 0.8850   3rd Qu.: 0.7100   3rd Qu.: 0.7927  
##  Max.   : 2.1763   Max.   : 2.0603   Max.   : 2.0018   Max.   : 2.3619  
##  width.of.kernel   asymetry.coefficient length.of.kernel.groove
##  Min.   :-1.6642   Min.   :-1.95210     Min.   :-1.8090        
##  1st Qu.:-0.8329   1st Qu.:-0.75734     1st Qu.:-0.7387        
##  Median :-0.0572   Median :-0.06731     Median :-0.3766        
##  Mean   : 0.0000   Mean   : 0.00000     Mean   : 0.0000        
##  3rd Qu.: 0.8026   3rd Qu.: 0.71068     3rd Qu.: 0.9541        
##  Max.   : 2.0502   Max.   : 3.16303     Max.   : 2.3234

# --------------------------------------------------
# 6) Build the Distance Matrix & Hierarchical Clustering
# --------------------------------------------------
# Use Euclidean distance for numeric data
dist_mat <- dist(seeds_df_sc, method = "euclidean")

# Perform hierarchical clustering with "average" linkage
hclust_avg <- hclust(dist_mat, method = "average")

# --------------------------------------------------
# 7) Visualize the Dendrogram
# --------------------------------------------------
plot(hclust_avg)                    # Basic dendrogram
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")

# --------------------------------------------------
# 8) Cut the Tree into 3 Clusters
# --------------------------------------------------
cut_avg <- cutree(hclust_avg, k = 3)

# --------------------------------------------------
# 9) Color the Dendrogram Branches
# --------------------------------------------------
# Install and load dendextend (if not already installed)
if (!require("dendextend")) install.packages("dendextend", dependencies = TRUE)

## Loading required package: dendextend

## Warning: package 'dendextend' was built under R version 4.3.3

## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(dendextend)

avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)

# --------------------------------------------------
# 10) Evaluate Clusters with dplyr
# --------------------------------------------------
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
library(dplyr)

# Put the clusters back into the original numeric dataframe
seeds_df_cl <- mutate(seeds_df, cluster = cut_avg)

# How many observations per cluster?
count(seeds_df_cl, cluster)

##   cluster  n
## 1       1 65
## 2       2 75
## 3       3 70

# --------------------------------------------------
# 11) Compare Clusters to True Labels
# --------------------------------------------------
table(seeds_df_cl$cluster, seeds_label)

##    seeds_label
##      1  2  3
##   1 55  2  8
##   2  7 68  0
##   3  8  0 62

# --------------------------------------------------
# 12) Scatter Plot of area vs. perimeter (Colored by Cluster)
# --------------------------------------------------
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.3.2

ggplot(seeds_df_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
  geom_point()

cm <- table(seeds_df_cl$cluster, seeds_label)
cm

##    seeds_label
##      1  2  3
##   1 55  2  8
##   2  7 68  0
##   3  8  0 62

error <- 1-sum(diag(cm))/sum(cm)
error

## [1] 0.1190476

library(cluster)
s <- silhouette(seeds_df_cl$cluster, dist(seeds_df_sc))
plot(s)

d <- dist(scale(seeds_df[,-8]))
methds <- c('complete','single','average')
avgS <- matrix(NA, ncol=3,nrow=5, dimnames = list(2:6,methds))
avgS

##   complete single average
## 2       NA     NA      NA
## 3       NA     NA      NA
## 4       NA     NA      NA
## 5       NA     NA      NA
## 6       NA     NA      NA

for (k in 2:6)
  for(m in seq_along(methds)){
    h <- hclust(d,meth=methds[m])
    c <- cutree(h,k)
    s <- silhouette(c,d)
    avgS[k-1,m]<-mean(s[,3])
  }
avgS

##    complete       single   average
## 2 0.4519948  0.056230166 0.4413386
## 3 0.3501985 -0.005642379 0.3759568
## 4 0.3148568 -0.082752642 0.3548642
## 5 0.2937202 -0.094595852 0.2752309
## 6 0.2173803 -0.229386235 0.2903059

set.seed(1234)
seedKM <- kmeans(seeds_df[,-8], center=3,iter.max = 200)
seedKM

## K-means clustering with 3 clusters of sizes 72, 61, 77
## 
## Cluster means:
##       area perimeter compactness length.of.kernel width.of.kernel
## 1 14.64847  14.46042   0.8791667         5.563778        3.277903
## 2 18.72180  16.29738   0.8850869         6.208934        3.722672
## 3 11.96442  13.27481   0.8522000         5.229286        2.872922
##   asymetry.coefficient length.of.kernel.groove
## 1             2.648933                5.192319
## 2             3.603590                6.066098
## 3             4.759740                5.088519
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   3   1   1   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   3   1   1   1   1   1   1   1   1   1   1   2   1   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   3   3   3   3   1   1   1   1   1   3   2   2   2   2   2   2   2   2   2   2 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   2   1   2   1   2   2   2   2   2   2   2   1   1   1   1   2   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 201 202 203 204 205 206 207 208 209 210 
##   3   1   3   3   3   3   3   3   3   3 
## 
## Within cluster sum of squares by cluster:
## [1] 207.4648 184.1086 195.7453
##  (between_SS / total_SS =  78.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

table(seedKM$cluster, seeds_label)

##    seeds_label
##      1  2  3
##   1 60 10  2
##   2  1 60  0
##   3  9  0 68

cm <- table(seedKM$cluster, seeds_label)
errorKM <-1-sum(diag(cm))/sum(cm)
errorKM

## [1] 0.1047619

cm

##    seeds_label
##      1  2  3
##   1 60 10  2
##   2  1 60  0
##   3  9  0 68

skm <- silhouette(seedKM$cluster,dist(seeds_df_sc))
plot(skm)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

CW22525

Deveon Wright

2025-03-04

R Markdown