library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# --------------------------------------------------
# 1) Setup
# --------------------------------------------------
# Keep results reproducible
set.seed(786)

# Provide the path to your seeds_dataset.txt file
seeds_df <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
file_loc <- "~/Downloads/seeds_dataset.txt"

# Read the seeds data (tab-separated, no header)
# This dataset has 8 columns:
#   1) area
#   2) perimeter
#   3) compactness
#   4) length_of_kernel
#   5) width.of.kernel
#   6) asymetry.coefficient
#   7) length.of.kernel.groove
#   8) type.of.seed
#seeds_dataset <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
# --------------------------------------------------
# 2) Assign Column Names
# --------------------------------------------------
feature_name <- c(
  "area",
  "perimeter",
  "compactness",
  "length.of.kernel",
  "width.of.kernel",
  "asymetry.coefficient",
  "length.of.kernel.groove",
  "type.of.seed"
)
colnames(seeds_df) <- feature_name

# Quick checks (optional)
str(seeds_df)
## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length.of.kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width.of.kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymetry.coefficient   : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length.of.kernel.groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type.of.seed           : int  1 1 1 1 1 1 1 1 1 1 ...
summary(seeds_df)
##       area         perimeter      compactness     length.of.kernel
##  Min.   :10.59   Min.   :12.41   Min.   :0.8081   Min.   :4.899   
##  1st Qu.:12.27   1st Qu.:13.45   1st Qu.:0.8569   1st Qu.:5.262   
##  Median :14.36   Median :14.32   Median :0.8734   Median :5.524   
##  Mean   :14.85   Mean   :14.56   Mean   :0.8710   Mean   :5.629   
##  3rd Qu.:17.30   3rd Qu.:15.71   3rd Qu.:0.8878   3rd Qu.:5.980   
##  Max.   :21.18   Max.   :17.25   Max.   :0.9183   Max.   :6.675   
##  width.of.kernel asymetry.coefficient length.of.kernel.groove  type.of.seed
##  Min.   :2.630   Min.   :0.7651       Min.   :4.519           Min.   :1    
##  1st Qu.:2.944   1st Qu.:2.5615       1st Qu.:5.045           1st Qu.:1    
##  Median :3.237   Median :3.5990       Median :5.223           Median :2    
##  Mean   :3.259   Mean   :3.7002       Mean   :5.408           Mean   :2    
##  3rd Qu.:3.562   3rd Qu.:4.7687       3rd Qu.:5.877           3rd Qu.:3    
##  Max.   :4.033   Max.   :8.4560       Max.   :6.550           Max.   :3
any(is.na(seeds_df))
## [1] FALSE
# --------------------------------------------------
# 3) Remove Rows with Missing Values
# --------------------------------------------------
seeds_df <- na.omit(seeds_df)

# --------------------------------------------------
# 4) Separate Out the True Labels
# --------------------------------------------------
# We'll hold onto these so we can check clustering results later
seeds_label <- seeds_df$type.of.seed

# Remove the type.of.seed column so only numeric features remain
seeds_df$type.of.seed <- NULL

# --------------------------------------------------
# 5) Scale the Data
# --------------------------------------------------
# scale() transforms each numeric column to mean=0, sd=1
seeds_df_sc <- as.data.frame(scale(seeds_df))
summary(seeds_df_sc)
##       area           perimeter        compactness      length.of.kernel 
##  Min.   :-1.4632   Min.   :-1.6458   Min.   :-2.6619   Min.   :-1.6466  
##  1st Qu.:-0.8858   1st Qu.:-0.8494   1st Qu.:-0.5967   1st Qu.:-0.8267  
##  Median :-0.1693   Median :-0.1832   Median : 0.1037   Median :-0.2371  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8446   3rd Qu.: 0.8850   3rd Qu.: 0.7100   3rd Qu.: 0.7927  
##  Max.   : 2.1763   Max.   : 2.0603   Max.   : 2.0018   Max.   : 2.3619  
##  width.of.kernel   asymetry.coefficient length.of.kernel.groove
##  Min.   :-1.6642   Min.   :-1.95210     Min.   :-1.8090        
##  1st Qu.:-0.8329   1st Qu.:-0.75734     1st Qu.:-0.7387        
##  Median :-0.0572   Median :-0.06731     Median :-0.3766        
##  Mean   : 0.0000   Mean   : 0.00000     Mean   : 0.0000        
##  3rd Qu.: 0.8026   3rd Qu.: 0.71068     3rd Qu.: 0.9541        
##  Max.   : 2.0502   Max.   : 3.16303     Max.   : 2.3234
# --------------------------------------------------
# 6) Build the Distance Matrix & Hierarchical Clustering
# --------------------------------------------------
# Use Euclidean distance for numeric data
dist_mat <- dist(seeds_df_sc, method = "euclidean")

# Perform hierarchical clustering with "average" linkage
hclust_avg <- hclust(dist_mat, method = "average")

# --------------------------------------------------
# 7) Visualize the Dendrogram
# --------------------------------------------------
plot(hclust_avg)                    # Basic dendrogram
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")

# --------------------------------------------------
# 8) Cut the Tree into 3 Clusters
# --------------------------------------------------
cut_avg <- cutree(hclust_avg, k = 3)

# --------------------------------------------------
# 9) Color the Dendrogram Branches
# --------------------------------------------------
# Install and load dendextend (if not already installed)
if (!require("dendextend")) install.packages("dendextend", dependencies = TRUE)
## Loading required package: dendextend
## Warning: package 'dendextend' was built under R version 4.3.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(dendextend)

avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)

# --------------------------------------------------
# 10) Evaluate Clusters with dplyr
# --------------------------------------------------
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
library(dplyr)

# Put the clusters back into the original numeric dataframe
seeds_df_cl <- mutate(seeds_df, cluster = cut_avg)

# How many observations per cluster?
count(seeds_df_cl, cluster)
##   cluster  n
## 1       1 65
## 2       2 75
## 3       3 70
# --------------------------------------------------
# 11) Compare Clusters to True Labels
# --------------------------------------------------
table(seeds_df_cl$cluster, seeds_label)
##    seeds_label
##      1  2  3
##   1 55  2  8
##   2  7 68  0
##   3  8  0 62
# --------------------------------------------------
# 12) Scatter Plot of area vs. perimeter (Colored by Cluster)
# --------------------------------------------------
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(seeds_df_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
  geom_point()

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.