This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document is generated that includes both the written content and the output of any embedded R code chunks. You can embed an R code chunk like this:
# Load necessary libraries
library(ggplot2)
library(cluster)
# Install dendextend only when it is missing, so knitting does not
# re-download the package on every run; fetch it over HTTPS
if (!requireNamespace("dendextend", quietly = TRUE)) {
  install.packages("dendextend", repos = "https://cran.rstudio.com/")
}
## Installing package into 'C:/Users/yeu3178/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'dendextend' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\yeu3178\AppData\Local\Temp\RtmpCm4XWK\downloaded_packages
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Set seed for reproducibility
set.seed(786)
# Location of the raw seeds data. The SEEDS_DATASET_PATH environment variable
# overrides the default so the analysis can also run on other machines.
seeds_path <- Sys.getenv(
  "SEEDS_DATASET_PATH",
  unset = "C:/Users/yeu3178/Downloads/seeds_dataset.txt"
)
# Fail with a clear message instead of a low-level connection error
if (!file.exists(seeds_path)) {
  stop("Seeds dataset not found at: ", seeds_path, call. = FALSE)
}
# The file has no header row, so column names are assigned manually below
seeds_dataset <- read.delim(seeds_path, header = FALSE)
# Define correct column names
features_name <- c(
  "area", "perimeter", "compactness", "length_of_kernel",
  "width_of_kernel", "asymmetry_coefficient",
  "length_of_kernel_groove", "type_of_seed"
)
# Stop early if the file does not have one column per expected name;
# assigning a shorter name vector would otherwise leave NA column names
stopifnot(ncol(seeds_dataset) == length(features_name))
# Assign column names to the dataset
colnames(seeds_dataset) <- features_name
# Open the data viewer only in interactive sessions; there is no viewer
# to open while the document is being knitted
if (interactive()) {
  View(seeds_dataset)
}
# Store dataset in a variable for processing
sd <- seeds_dataset
# Check structure and summary
summary(sd)
## area perimeter compactness length_of_kernel
## Min. :10.59 Min. :12.41 Min. :0.8081 Min. :4.899
## 1st Qu.:12.27 1st Qu.:13.45 1st Qu.:0.8569 1st Qu.:5.262
## Median :14.36 Median :14.32 Median :0.8734 Median :5.524
## Mean :14.85 Mean :14.56 Mean :0.8710 Mean :5.629
## 3rd Qu.:17.30 3rd Qu.:15.71 3rd Qu.:0.8878 3rd Qu.:5.980
## Max. :21.18 Max. :17.25 Max. :0.9183 Max. :6.675
## width_of_kernel asymmetry_coefficient length_of_kernel_groove type_of_seed
## Min. :2.630 Min. :0.7651 Min. :4.519 Min. :1
## 1st Qu.:2.944 1st Qu.:2.5615 1st Qu.:5.045 1st Qu.:1
## Median :3.237 Median :3.5990 Median :5.223 Median :2
## Mean :3.259 Mean :3.7002 Mean :5.408 Mean :2
## 3rd Qu.:3.562 3rd Qu.:4.7687 3rd Qu.:5.877 3rd Qu.:3
## Max. :4.033 Max. :8.4560 Max. :6.550 Max. :3
str(sd)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width_of_kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry_coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length_of_kernel_groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type_of_seed : int 1 1 1 1 1 1 1 1 1 1 ...
# Report whether any value in the data frame is missing
anyNA(sd)
## [1] FALSE
# Drop incomplete rows; this leaves the data unchanged when nothing is missing
sd <- na.omit(object = sd)
# Re-inspect the structure after cleaning
str(object = sd)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length_of_kernel : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width_of_kernel : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry_coefficient : num 2.22 1.02 2.7 2.26 1.35 ...
## $ length_of_kernel_groove: num 5.22 4.96 4.83 4.8 5.17 ...
## $ type_of_seed : int 1 1 1 1 1 1 1 1 1 1 ...
# Keep the seed-type labels aside so they are not standardized with the features
seeds_label <- sd[["type_of_seed"]]
# Drop the label column, leaving only the measurement columns in sd
sd[["type_of_seed"]] <- NULL
# Standardize every remaining column and return the result as a data frame
sd_scaled <- sd |>
  scale() |>
  as.data.frame()
# Per-column summary of the standardized values
summary(object = sd_scaled)
## area perimeter compactness length_of_kernel
## Min. :-1.4632 Min. :-1.6458 Min. :-2.6619 Min. :-1.6466
## 1st Qu.:-0.8858 1st Qu.:-0.8494 1st Qu.:-0.5967 1st Qu.:-0.8267
## Median :-0.1693 Median :-0.1832 Median : 0.1037 Median :-0.2371
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8446 3rd Qu.: 0.8850 3rd Qu.: 0.7100 3rd Qu.: 0.7927
## Max. : 2.1763 Max. : 2.0603 Max. : 2.0018 Max. : 2.3619
## width_of_kernel asymmetry_coefficient length_of_kernel_groove
## Min. :-1.6642 Min. :-1.95210 Min. :-1.8090
## 1st Qu.:-0.8329 1st Qu.:-0.75734 1st Qu.:-0.7387
## Median :-0.0572 Median :-0.06731 Median :-0.3766
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.8026 3rd Qu.: 0.71068 3rd Qu.: 0.9541
## Max. : 2.0502 Max. : 3.16303 Max. : 2.3234
# View the processed dataset. The viewer is opened only in interactive
# sessions; there is no viewer to open while the document is being knitted.
if (interactive()) {
  View(sd_scaled)
}
# Work on the seed measurements held in sd
data <- sd # Ensure the dataset contains only numeric columns
# Keep only numeric columns. vapply() always yields a logical mask, and
# drop = FALSE keeps a data frame even if a single column remains.
data <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# Standardize (scale) the data
data_scaled <- scale(data)
# Compute Euclidean distance matrix
dist_matrix <- dist(data_scaled, method = "euclidean")
# Perform hierarchical clustering using the average linkage method
hclust_avg <- hclust(dist_matrix, method = "average")
# Plot the dendrogram
plot(hclust_avg, main = "Cluster Dendrogram", xlab = "", sub = "", cex = 0.6)
# Convert to dendrogram object so dendextend can style it
dend <- as.dendrogram(hclust_avg)
# Color the branches according to a cut into k = 3 clusters
dend <- color_branches(dend, k = 3)
plot(dend, main = "Enhanced Cluster Dendrogram")
# Re-draw the average-linkage dendrogram held in hclust_avg so that the
# annotations below are added to a fresh plot. The scaling, distance and
# clustering steps are not repeated: data_scaled, dist_matrix and hclust_avg
# already hold exactly these results.
plot(hclust_avg, main = "Cluster Dendrogram", xlab = "", sub = "", cex = 0.6)
# Add cluster rectangles (highlight 3 clusters)
rect.hclust(hclust_avg, k = 3, border = c("blue", "green", "red"))
# Add a horizontal line at height = 3 for visual separation
abline(h = 3, col = "red")
# Partition the standardized observations into three k-means clusters.
# Re-seeding first makes the 25 random starts reproducible.
set.seed(786)
data_cl <- kmeans(x = data_scaled, centers = 3, nstart = 25)
# Cross-tabulate cluster ids (rows) against the stored seed labels (columns)
table(data_cl[["cluster"]], seeds_label)
## seeds_label
## 1 2 3
## 1 2 65 0
## 2 62 5 4
## 3 6 0 66
# Gather the two plotted measurements and each observation's k-means cluster
# id into one data frame. Note: this replaces the kmeans result in data_cl.
data_cl <- with(
  data,
  data.frame(area = area, perimeter = perimeter, cluster = data_cl$cluster)
)
# Area vs. perimeter, one point per observation, colored by cluster id
ggplot(
  data = data_cl,
  mapping = aes(x = area, y = perimeter, color = factor(cluster))
) +
  geom_point() +
  labs(
    title = "Clustering Results",
    x = "Area",
    y = "Perimeter",
    color = "Cluster"
  ) +
  theme_minimal()
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.