This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readr)
Germination <- read_csv("C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data/Germination.csv")
## Rows: 21 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): seed
## dbl (3): rownames, n, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Germination)
germ_df <- Germination
germ_df
## # A tibble: 21 × 4
## rownames seed n y
## <dbl> <chr> <dbl> <dbl>
## 1 1 O75 39 10
## 2 2 O75 62 23
## 3 3 O75 81 23
## 4 4 O75 51 26
## 5 5 O75 39 17
## 6 6 O75 6 5
## 7 7 O75 74 53
## 8 8 O75 72 55
## 9 9 O75 51 32
## 10 10 O75 79 46
## # ℹ 11 more rows
#View(germ_df)
str(germ_df)
## spc_tbl_ [21 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ rownames: num [1:21] 1 2 3 4 5 6 7 8 9 10 ...
## $ seed : chr [1:21] "O75" "O75" "O75" "O75" ...
## $ n : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
## $ y : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...
## - attr(*, "spec")=
## .. cols(
## .. rownames = col_double(),
## .. seed = col_character(),
## .. n = col_double(),
## .. y = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(germ_df)
## [1] 21 4
germ_df$seed
## [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"
#Drop the rownames
germ_df1 <- germ_df[,-1]
#Change seed to factor and make it numeric
germ_df1$seed <- as.integer(factor(germ_df1$seed))
str(germ_df1$seed)
## int [1:21] 2 2 2 2 2 2 2 2 2 2 ...
is.na(germ_df1)
## seed n y
## [1,] FALSE FALSE FALSE
## [2,] FALSE FALSE FALSE
## [3,] FALSE FALSE FALSE
## [4,] FALSE FALSE FALSE
## [5,] FALSE FALSE FALSE
## [6,] FALSE FALSE FALSE
## [7,] FALSE FALSE FALSE
## [8,] FALSE FALSE FALSE
## [9,] FALSE FALSE FALSE
## [10,] FALSE FALSE FALSE
## [11,] FALSE FALSE FALSE
## [12,] FALSE FALSE FALSE
## [13,] FALSE FALSE FALSE
## [14,] FALSE FALSE FALSE
## [15,] FALSE FALSE FALSE
## [16,] FALSE FALSE FALSE
## [17,] FALSE FALSE FALSE
## [18,] FALSE FALSE FALSE
## [19,] FALSE FALSE FALSE
## [20,] FALSE FALSE FALSE
## [21,] FALSE FALSE FALSE
sum(is.na(germ_df1))
## [1] 0
table(germ_df1$seed)
##
## 1 2
## 10 11
prop.table(table(germ_df1$seed))
##
## 1 2
## 0.4761905 0.5238095
germ_df1
## # A tibble: 21 × 3
## seed n y
## <int> <dbl> <dbl>
## 1 2 39 10
## 2 2 62 23
## 3 2 81 23
## 4 2 51 26
## 5 2 39 17
## 6 2 6 5
## 7 2 74 53
## 8 2 72 55
## 9 2 51 32
## 10 2 79 46
## # ℹ 11 more rows
dim(germ_df1)
## [1] 21 3
View(germ_df1)
str(germ_df1)
## tibble [21 × 3] (S3: tbl_df/tbl/data.frame)
## $ seed: int [1:21] 2 2 2 2 2 2 2 2 2 2 ...
## $ n : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
## $ y : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...
germ_df1_scale <- as.data.frame(scale(germ_df1))
germ_df1_scale
## seed n y
## 1 0.9304842 -0.02311077 -0.6338642
## 2 0.9304842 0.90709781 0.1747569
## 3 0.9304842 1.67553098 0.1747569
## 4 0.9304842 0.46221544 0.3613618
## 5 0.9304842 -0.02311077 -0.1984528
## 6 0.9304842 -1.35775786 -0.9448723
## 7 0.9304842 1.39242402 2.0408057
## 8 0.9304842 1.31153632 2.1652089
## 9 0.9304842 0.46221544 0.7345716
## 10 0.9304842 1.59464328 1.6053943
## 11 0.9304842 -1.07465091 -0.6338642
## 12 -1.0235326 -0.95331935 -0.7582674
## 13 -1.0235326 -0.38710543 -0.6338642
## 14 -1.0235326 -0.46799314 -0.7582674
## 15 -1.0235326 0.21955234 0.1747569
## 16 -1.0235326 -1.43864557 -1.2558804
## 17 -1.0235326 -1.11509476 -1.0692756
## 18 -1.0235326 0.05777693 0.1125553
## 19 -1.0235326 -0.38710543 -0.3228561
## 20 -1.0235326 0.46221544 0.7345716
## 21 -1.0235326 -1.31731401 -1.0692756
str(germ_df1_scale)
## 'data.frame': 21 obs. of 3 variables:
## $ seed: num 0.93 0.93 0.93 0.93 0.93 ...
## $ n : num -0.0231 0.9071 1.6755 0.4622 -0.0231 ...
## $ y : num -0.634 0.175 0.175 0.361 -0.198 ...
View(germ_df1_scale)
dist_mat <- dist(germ_df1_scale, method="euclidean")
#View(dist_mat)
library(fpc)
## Warning: package 'fpc' was built under R version 4.3.3
library (cluster)
library (vegan)
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.3.3
## Loading required package: lattice
## This is vegan 2.6-4
h_germ_df1<- hclust(dist_mat, method='average')
h_germ_df1_single<- hclust(dist_mat, method='single')
h_germ_df1_complete<- hclust(dist_mat, method='complete')
#show summary of cluster for method='average'
h_germ_df1
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 21
##fixing margin too large error
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)
#show summary of cluster for method='single'
h_germ_df1_single
##
## Call:
## hclust(d = dist_mat, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 21
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_single)
#show summary of cluster for method='complete'
h_germ_df1_complete
##
## Call:
## hclust(d = dist_mat, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 21
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_complete)
### Comparison of silhouette coefficients
################
set.seed(1234)
#d <- dist(scale(iris[ , -5]))
d <- dist(germ_df1_scale, method="euclidean")
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
dimnames=list(2:6, methds))
for(k in 2:6)
for(m in seq_along(methds)) {
h <- hclust(d, meth=methds[m])
c <- cutree(h, k)
s <- silhouette(c, d)
#plot(silhouette(c, d))
avgS[k-1, m] <- mean(s[ , 3])
}
avgS
## complete single average
## 2 0.3692837 0.4475646 0.4116283
## 3 0.3475103 0.4804590 0.4804590
## 4 0.4990830 0.5035927 0.5035927
## 5 0.5468301 0.4559720 0.5524545
## 6 0.5677947 0.5005272 0.5080595
plot(silhouette(c, d))
####################3
h_germ_df1<- hclust(dist_mat, method='average')
h_germ_df1_single<- hclust(dist_mat, method='single')
h_germ_df1_complete<- hclust(dist_mat, method='complete')
#show summary of cluster for method='average'
h_germ_df1
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 21
##fixing margin too large error
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)
#show summary of cluster for method='single'
h_germ_df1_single
##
## Call:
## hclust(d = dist_mat, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 21
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_single)
#show summary of cluster for method='complete' and this is better
h_germ_df1_complete
##
## Call:
## hclust(d = dist_mat, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 21
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_complete)
##Checking out cluster … Complete
hca <- h_germ_df1_complete
plot(hca)
rect.hclust(hca, k = 2, border = "red")
x <- rect.hclust(hca, h = 2, which = c(1,2), border = 2:5)
library(dendextend)
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:permute':
##
## shuffle
## The following object is masked from 'package:stats':
##
## cutree
avg_dendogram_obj <- as.dendrogram(h_germ_df1_complete)
avg_col_dendogram <- color_branches(avg_dendogram_obj, h=2)
plot(avg_col_dendogram)
hac_cut <- cutree(h_germ_df1_complete, k=2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
germ_df_cl <- mutate(germ_df1_scale, cluster=hac_cut)
germ_df_cl
## seed n y cluster
## 1 0.9304842 -0.02311077 -0.6338642 1
## 2 0.9304842 0.90709781 0.1747569 2
## 3 0.9304842 1.67553098 0.1747569 2
## 4 0.9304842 0.46221544 0.3613618 2
## 5 0.9304842 -0.02311077 -0.1984528 1
## 6 0.9304842 -1.35775786 -0.9448723 1
## 7 0.9304842 1.39242402 2.0408057 2
## 8 0.9304842 1.31153632 2.1652089 2
## 9 0.9304842 0.46221544 0.7345716 2
## 10 0.9304842 1.59464328 1.6053943 2
## 11 0.9304842 -1.07465091 -0.6338642 1
## 12 -1.0235326 -0.95331935 -0.7582674 1
## 13 -1.0235326 -0.38710543 -0.6338642 1
## 14 -1.0235326 -0.46799314 -0.7582674 1
## 15 -1.0235326 0.21955234 0.1747569 2
## 16 -1.0235326 -1.43864557 -1.2558804 1
## 17 -1.0235326 -1.11509476 -1.0692756 1
## 18 -1.0235326 0.05777693 0.1125553 2
## 19 -1.0235326 -0.38710543 -0.3228561 1
## 20 -1.0235326 0.46221544 0.7345716 2
## 21 -1.0235326 -1.31731401 -1.0692756 1
## Count Cluster classification
count(germ_df_cl, cluster)
## cluster n
## 1 1 11
## 2 2 10
## compare with Original Dataset classification
count(germ_df, seed)
## # A tibble: 2 × 2
## seed n
## <chr> <int>
## 1 O73 10
## 2 O75 11
library(ggplot2)
##ggplot(seeds_df_cl, aes(x=seeds_df_cl$area, y=seeds_df_cl$perimeter , color=factor(cluster))) + geom_point()
ggplot(germ_df_cl, aes(x=n , y=y , color=factor(cluster))) + geom_point()
#Use of `seeds_df_cl$perimeter` is discouraged.ℹ Use `perimeter` instead.
##ggplot(seeds_df_cl, aes(x=area, y=perimeter , color=factor(cluster))) + geom_point()
ggplot(germ_df, aes(x=n, y=y , color=factor(germ_df1$seed))) + geom_point()
#Install clvalid
#library(clValid)
#dunn(dist_mat, hac_cut)
#Install clvalid
library(clValid)
dunn(dist_mat, hac_cut)
## [1] 0.2008738
dist_mat[1:21]
## [1] 1.2325405 1.8812900 1.1072562 0.4354114 1.3704047 3.0261523 3.1009826
## [8] 1.4519497 2.7625001 1.0515401 2.1677052 1.9876302 2.0078791 2.1285994
## [15] 2.4917513 2.2803934 2.0932909 2.0118151 2.4344075 2.3838471 0.7684332
###KNN
set.seed(1234)
germ_knn <- kmeans(germ_df1, center=2, iter.max=10, nstart=10)
germ_knn
## K-means clustering with 2 clusters of sizes 12, 9
##
## Cluster means:
## seed n y
## 1 1.333333 22.08333 9.25000
## 2 1.777778 62.88889 34.77778
##
## Clustering vector:
## [1] 1 2 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 2569.833 2916.000
## (between_SS / total_SS = 68.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
germ_knn$betweenss
## [1] 11915.79
germ_knn$withinss
## [1] 2569.833 2916.000
germ_knn$tot.withinss
## [1] 5485.833
table(germ_knn$cluster, germ_df1$seed)
##
## 1 2
## 1 8 4
## 2 2 7
cm <- table(germ_knn$cluster, germ_df1$seed)
1 - sum(diag(cm))/sum(cm)
## [1] 0.2857143