library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# --------------------------------------------------
# 1) Setup
# --------------------------------------------------
# Keep results reproducible
set.seed(786)
# Provide the path to your seeds_dataset.txt file
seeds_df <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
file_loc <- "~/Downloads/seeds_dataset.txt"
# Read the seeds data (tab-separated, no header)
# This dataset has 8 columns:
# 1) area
# 2) perimeter
# 3) compactness
# 4) length_of_kernel
# 5) width.of.kernel
# 6) asymetry.coefficient
# 7) length.of.kernel.groove
# 8) type.of.seed
#seeds_dataset <- read.delim("~/Downloads/seeds_dataset.txt", header=FALSE)
# --------------------------------------------------
# 2) Assign Column Names
# --------------------------------------------------
feature_name <- c(
"area",
"perimeter",
"compactness",
"length.of.kernel",
"width.of.kernel",
"asymetry.coefficient",
"length.of.kernel.groove",
"type.of.seed"
)
colnames(seeds_df) <- feature_name
# Quick checks (optional)
str(seeds_df)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length.of.kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width.of.kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymetry.coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length.of.kernel.groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type.of.seed : int 1 1 1 1 1 1 1 1 1 1 ...
summary(seeds_df)
## area perimeter compactness length.of.kernel
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.27 1st Qu.:13.45 1st Qu.:0.8569 1st Qu.:5.262
## Median :14.36 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8710 Mean :5.629
## 3rd Qu.:17.30 3rd Qu.:15.71 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## width.of.kernel asymetry.coefficient length.of.kernel.groove type.of.seed
## Min. :2.630 Min. :0.7651 Min. :4.519 Min. :1
## 1st Qu.:2.944 1st Qu.:2.5615 1st Qu.:5.045 1st Qu.:1
## Median :3.237 Median :3.5990 Median :5.223 Median :2
## Mean :3.259 Mean :3.7002 Mean :5.408 Mean :2
## 3rd Qu.:3.562 3rd Qu.:4.7687 3rd Qu.:5.877 3rd Qu.:3
## Max. :4.033 Max. :8.4560 Max. :6.550 Max. :3
any(is.na(seeds_df))
## [1] FALSE
# --------------------------------------------------
# 3) Remove Rows with Missing Values
# --------------------------------------------------
seeds_df <- na.omit(seeds_df)
# --------------------------------------------------
# 4) Separate Out the True Labels
# --------------------------------------------------
# We'll hold onto these so we can check clustering results later
seeds_label <- seeds_df$type.of.seed
# Remove the type.of.seed column so only numeric features remain
seeds_df$type.of.seed <- NULL
# --------------------------------------------------
# 5) Scale the Data
# --------------------------------------------------
# scale() transforms each numeric column to mean=0, sd=1
seeds_df_sc <- as.data.frame(scale(seeds_df))
summary(seeds_df_sc)
## area perimeter compactness length.of.kernel
## Min. :-1.4632 Min. :-1.6458 Min. :-2.6619 Min. :-1.6466
## 1st Qu.:-0.8858 1st Qu.:-0.8494 1st Qu.:-0.5967 1st Qu.:-0.8267
## Median :-0.1693 Median :-0.1832 Median : 0.1037 Median :-0.2371
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8446 3rd Qu.: 0.8850 3rd Qu.: 0.7100 3rd Qu.: 0.7927
## Max. : 2.1763 Max. : 2.0603 Max. : 2.0018 Max. : 2.3619
## width.of.kernel asymetry.coefficient length.of.kernel.groove
## Min. :-1.6642 Min. :-1.95210 Min. :-1.8090
## 1st Qu.:-0.8329 1st Qu.:-0.75734 1st Qu.:-0.7387
## Median :-0.0572 Median :-0.06731 Median :-0.3766
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.8026 3rd Qu.: 0.71068 3rd Qu.: 0.9541
## Max. : 2.0502 Max. : 3.16303 Max. : 2.3234
# --------------------------------------------------
# 6) Build the Distance Matrix & Hierarchical Clustering
# --------------------------------------------------
# Use Euclidean distance for numeric data
dist_mat <- dist(seeds_df_sc, method = "euclidean")
# Perform hierarchical clustering with "average" linkage
hclust_avg <- hclust(dist_mat, method = "average")
# --------------------------------------------------
# 7) Visualize the Dendrogram
# --------------------------------------------------
plot(hclust_avg) # Basic dendrogram
rect.hclust(hclust_avg, k = 3, border = 2:6)
abline(h = 3, col = "red")
# --------------------------------------------------
# 8) Cut the Tree into 3 Clusters
# --------------------------------------------------
cut_avg <- cutree(hclust_avg, k = 3)
# --------------------------------------------------
# 9) Color the Dendrogram Branches
# --------------------------------------------------
# Install and load dendextend (if not already installed)
if (!require("dendextend")) install.packages("dendextend", dependencies = TRUE)
## Loading required package: dendextend
## Warning: package 'dendextend' was built under R version 4.3.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(dendextend)
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
# --------------------------------------------------
# 10) Evaluate Clusters with dplyr
# --------------------------------------------------
if (!require("dplyr")) install.packages("dplyr", dependencies = TRUE)
library(dplyr)
# Put the clusters back into the original numeric dataframe
seeds_df_cl <- mutate(seeds_df, cluster = cut_avg)
# How many observations per cluster?
count(seeds_df_cl, cluster)
## cluster n
## 1 1 65
## 2 2 75
## 3 3 70
# --------------------------------------------------
# 11) Compare Clusters to True Labels
# --------------------------------------------------
table(seeds_df_cl$cluster, seeds_label)
## seeds_label
## 1 2 3
## 1 55 2 8
## 2 7 68 0
## 3 8 0 62
# --------------------------------------------------
# 12) Scatter Plot of area vs. perimeter (Colored by Cluster)
# --------------------------------------------------
if (!require("ggplot2")) install.packages("ggplot2", dependencies = TRUE)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(seeds_df_cl, aes(x = area, y = perimeter, color = factor(cluster))) +
geom_point()
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.