CW02272025

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load plotting and clustering libraries
library(ggplot2)
library(cluster)

# Install dendextend only when it is not already available, so that running
# or knitting this document does not re-download the package every time.
# The repository is addressed over HTTPS rather than plain HTTP.
if (!requireNamespace("dendextend", quietly = TRUE)) {
  install.packages("dendextend", repos = "https://cran.rstudio.com/")
}

## Installing package into 'C:/Users/yeu3178/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'dendextend' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\yeu3178\AppData\Local\Temp\RtmpCm4XWK\downloaded_packages

library(dendextend)

## Warning: package 'dendextend' was built under R version 4.4.3

## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

# Set seed for reproducibility
set.seed(786)

# Load the seeds data. The file is read with header = FALSE, so the column
# names are assigned manually below.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists. Consider a project-relative path.
seeds_dataset <- read.delim("C:/Users/yeu3178/Downloads/seeds_dataset.txt",
                            header = FALSE)

# Names for the seven measurements plus the seed-type label, in file order
features_name <- c('area', 'perimeter', 'compactness', 'length_of_kernel',
                   'width_of_kernel', 'asymmetry_coefficient',
                   'length_of_kernel_groove', 'type_of_seed')

# Assign column names to the dataset
colnames(seeds_dataset) <- features_name

# Open the data viewer only in an interactive session. View() is a GUI
# side effect that adds nothing to a knitted/batch run and can fail where no
# viewer is available.
if (interactive()) {
  View(seeds_dataset)
}

# Keep the raw data in `seeds_dataset` and do all processing on `sd`.
# NOTE: `sd` shares its name with stats::sd(); the name is kept because the
# rest of the script refers to it.
sd <- seeds_dataset

# Print per-column summary statistics
summary(sd)

##       area         perimeter      compactness     length_of_kernel
##  Min.   :10.59   Min.   :12.41   Min.   :0.8081   Min.   :4.899   
##  1st Qu.:12.27   1st Qu.:13.45   1st Qu.:0.8569   1st Qu.:5.262   
##  Median :14.36   Median :14.32   Median :0.8734   Median :5.524   
##  Mean   :14.85   Mean   :14.56   Mean   :0.8710   Mean   :5.629   
##  3rd Qu.:17.30   3rd Qu.:15.71   3rd Qu.:0.8878   3rd Qu.:5.980   
##  Max.   :21.18   Max.   :17.25   Max.   :0.9183   Max.   :6.675   
##  width_of_kernel asymmetry_coefficient length_of_kernel_groove  type_of_seed
##  Min.   :2.630   Min.   :0.7651        Min.   :4.519           Min.   :1    
##  1st Qu.:2.944   1st Qu.:2.5615        1st Qu.:5.045           1st Qu.:1    
##  Median :3.237   Median :3.5990        Median :5.223           Median :2    
##  Mean   :3.259   Mean   :3.7002        Mean   :5.408           Mean   :2    
##  3rd Qu.:3.562   3rd Qu.:4.7687        3rd Qu.:5.877           3rd Qu.:3    
##  Max.   :4.033   Max.   :8.4560        Max.   :6.550           Max.   :3

# Print the column types and the first few values of each column
str(sd)

## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width_of_kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymmetry_coefficient  : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length_of_kernel_groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type_of_seed           : int  1 1 1 1 1 1 1 1 1 1 ...

# Check for missing values: prints TRUE if any cell in any column is NA
any(is.na(sd))

## [1] FALSE

# Drop every row that contains at least one NA. This is done while the
# `type_of_seed` column is still part of `sd`, so the labels extracted later
# stay aligned with the remaining rows. The check above printed FALSE for
# this run, so no rows are removed here.
sd <- na.omit(sd)

# Confirm dataset structure after cleaning
str(sd)

## 'data.frame':    210 obs. of  8 variables:
##  $ area                   : num  15.3 14.9 14.3 13.8 16.1 ...
##  $ perimeter              : num  14.8 14.6 14.1 13.9 15 ...
##  $ compactness            : num  0.871 0.881 0.905 0.895 0.903 ...
##  $ length_of_kernel       : num  5.76 5.55 5.29 5.32 5.66 ...
##  $ width_of_kernel        : num  3.31 3.33 3.34 3.38 3.56 ...
##  $ asymmetry_coefficient  : num  2.22 1.02 2.7 2.26 1.35 ...
##  $ length_of_kernel_groove: num  5.22 4.96 4.83 4.8 5.17 ...
##  $ type_of_seed           : int  1 1 1 1 1 1 1 1 1 1 ...

# Keep the seed-type labels aside; they are not a measurement and must not
# take part in the scaling.
seeds_label <- sd[["type_of_seed"]]

# Drop the label column so that only the measurement columns remain
sd[["type_of_seed"]] <- NULL

# Centre and scale every remaining column, then turn the resulting matrix
# back into a data frame
sd_scaled <- as.data.frame(scale(sd))

# Print summary statistics of the scaled columns
summary(sd_scaled)

##       area           perimeter        compactness      length_of_kernel 
##  Min.   :-1.4632   Min.   :-1.6458   Min.   :-2.6619   Min.   :-1.6466  
##  1st Qu.:-0.8858   1st Qu.:-0.8494   1st Qu.:-0.5967   1st Qu.:-0.8267  
##  Median :-0.1693   Median :-0.1832   Median : 0.1037   Median :-0.2371  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.8446   3rd Qu.: 0.8850   3rd Qu.: 0.7100   3rd Qu.: 0.7927  
##  Max.   : 2.1763   Max.   : 2.0603   Max.   : 2.0018   Max.   : 2.3619  
##  width_of_kernel   asymmetry_coefficient length_of_kernel_groove
##  Min.   :-1.6642   Min.   :-1.95210      Min.   :-1.8090        
##  1st Qu.:-0.8329   1st Qu.:-0.75734      1st Qu.:-0.7387        
##  Median :-0.0572   Median :-0.06731      Median :-0.3766        
##  Mean   : 0.0000   Mean   : 0.00000      Mean   : 0.0000        
##  3rd Qu.: 0.8026   3rd Qu.: 0.71068      3rd Qu.: 0.9541        
##  Max.   : 2.0502   Max.   : 3.16303      Max.   : 2.3234

# View the processed dataset
# Open the data viewer only in an interactive session; View() is a GUI side
# effect that adds nothing to a knitted/batch run.
if (interactive()) {
  View(sd_scaled)
}

# NOTE: the loading, cleaning and scaling statements that were repeated at
# this point were a verbatim copy of the pipeline above (same seed, same
# input file, same transformations). They recreated identical
# `seeds_dataset`, `sd`, `seeds_label` and `sd_scaled` objects and printed
# the same summaries a second time, so the copy has been removed.

# Cluster the measurement columns held in `sd` (the label column has already
# been removed from it).
data <- sd

# Keep only numeric columns. vapply() always returns a logical mask (sapply()
# changes its return type on empty input), and drop = FALSE keeps `data` a
# data frame even when a single column is selected.
data <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]

# Standardize (scale) the data
data_scaled <- scale(data)
# Compute Euclidean distance matrix
dist_matrix <- dist(data_scaled, method = "euclidean")
# Perform hierarchical clustering using the average linkage method
hclust_avg <- hclust(dist_matrix, method = "average")
# Plot the dendrogram
plot(hclust_avg, main = "Cluster Dendrogram", xlab = "", sub = "", cex = 0.6)

# Turn the hclust result into a dendrogram object and colour its branches
# according to a 3-cluster cut (color_branches() comes from dendextend)
dend <- hclust_avg |>
  as.dendrogram() |>
  color_branches(k = 3)

# Draw the coloured dendrogram
plot(dend, main = "Enhanced Cluster Dendrogram")

# Re-draw the average-linkage dendrogram. `hclust_avg` (built from
# `data_scaled` via `dist_matrix`) is reused from the clustering step above:
# its inputs have not changed since, so recomputing the scaling, the distance
# matrix and the clustering here only reproduced identical objects.
plot(hclust_avg, main = "Cluster Dendrogram", xlab = "", sub = "", cex = 0.6)

# Add cluster rectangles (highlight 3 clusters)
rect.hclust(hclust_avg, k = 3, border = c("blue", "green", "red"))

# Add a horizontal reference line at height = 3
abline(h = 3, col = "red")

# Partition the scaled observations into three k-means clusters, trying 25
# starts. The seed is fixed first so the result is reproducible.
set.seed(786)
data_cl <- kmeans(x = data_scaled, centers = 3, nstart = 25)

# Cross-tabulate cluster membership (rows) against the actual seed type
# (columns)
table(data_cl[["cluster"]], seeds_label)

##    seeds_label
##      1  2  3
##   1  2 65  0
##   2 62  5  4
##   3  6  0 66

# Assemble the plotting data: two of the unscaled measurements plus the
# k-means cluster assigned to each observation.
# NOTE: from here on `data_cl` is this data frame, no longer the k-means fit.
data_cl <- data.frame(
  area = data[["area"]],
  perimeter = data[["perimeter"]],
  cluster = data_cl[["cluster"]]
)

# Scatter plot of perimeter against area, one colour per cluster
# (ggplot2 is attached at the top of the script)
ggplot(data = data_cl,
       mapping = aes(x = area, y = perimeter, color = factor(cluster))) +
  geom_point() +
  labs(title = "Clustering Results", x = "Area", y = "Perimeter", color = "Cluster") +
  theme_minimal()

CW02272025

Chris Yeu

2025-02-27

R Markdown

Including Plots