Project_Germination

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Importing the dataset

library(readr)
Germination <- read_csv("C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data/Germination.csv")

## Rows: 21 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): seed
## dbl (3): rownames, n, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

View(Germination)

germ_df <- Germination

Checking out values of the datasets - Structures , NA etc

germ_df

## # A tibble: 21 × 4
##    rownames seed      n     y
##       <dbl> <chr> <dbl> <dbl>
##  1        1 O75      39    10
##  2        2 O75      62    23
##  3        3 O75      81    23
##  4        4 O75      51    26
##  5        5 O75      39    17
##  6        6 O75       6     5
##  7        7 O75      74    53
##  8        8 O75      72    55
##  9        9 O75      51    32
## 10       10 O75      79    46
## # ℹ 11 more rows

#View(germ_df)
str(germ_df)

## spc_tbl_ [21 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ rownames: num [1:21] 1 2 3 4 5 6 7 8 9 10 ...
##  $ seed    : chr [1:21] "O75" "O75" "O75" "O75" ...
##  $ n       : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
##  $ y       : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rownames = col_double(),
##   ..   seed = col_character(),
##   ..   n = col_double(),
##   ..   y = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

dim(germ_df)

## [1] 21  4

germ_df$seed

##  [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"

#Drop the rownames
germ_df1 <- germ_df[,-1]

#Change seed to factor and make it numeric
germ_df1$seed <- as.integer(factor(germ_df1$seed))


str(germ_df1$seed)

##  int [1:21] 2 2 2 2 2 2 2 2 2 2 ...

is.na(germ_df1)

##        seed     n     y
##  [1,] FALSE FALSE FALSE
##  [2,] FALSE FALSE FALSE
##  [3,] FALSE FALSE FALSE
##  [4,] FALSE FALSE FALSE
##  [5,] FALSE FALSE FALSE
##  [6,] FALSE FALSE FALSE
##  [7,] FALSE FALSE FALSE
##  [8,] FALSE FALSE FALSE
##  [9,] FALSE FALSE FALSE
## [10,] FALSE FALSE FALSE
## [11,] FALSE FALSE FALSE
## [12,] FALSE FALSE FALSE
## [13,] FALSE FALSE FALSE
## [14,] FALSE FALSE FALSE
## [15,] FALSE FALSE FALSE
## [16,] FALSE FALSE FALSE
## [17,] FALSE FALSE FALSE
## [18,] FALSE FALSE FALSE
## [19,] FALSE FALSE FALSE
## [20,] FALSE FALSE FALSE
## [21,] FALSE FALSE FALSE

sum(is.na(germ_df1))

## [1] 0

table(germ_df1$seed)

## 
##  1  2 
## 10 11

prop.table(table(germ_df1$seed))

## 
##         1         2 
## 0.4761905 0.5238095

Scaling the dataset to suppress the impact skewness or high values on others

Here we use “euclidean” because it is best suited for our task which are in continous value.

Manhattan is good for binary

germ_df1

## # A tibble: 21 × 3
##     seed     n     y
##    <int> <dbl> <dbl>
##  1     2    39    10
##  2     2    62    23
##  3     2    81    23
##  4     2    51    26
##  5     2    39    17
##  6     2     6     5
##  7     2    74    53
##  8     2    72    55
##  9     2    51    32
## 10     2    79    46
## # ℹ 11 more rows

dim(germ_df1)

## [1] 21  3

View(germ_df1)
str(germ_df1)

## tibble [21 × 3] (S3: tbl_df/tbl/data.frame)
##  $ seed: int [1:21] 2 2 2 2 2 2 2 2 2 2 ...
##  $ n   : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
##  $ y   : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...

germ_df1_scale <- as.data.frame(scale(germ_df1))
germ_df1_scale

##          seed           n          y
## 1   0.9304842 -0.02311077 -0.6338642
## 2   0.9304842  0.90709781  0.1747569
## 3   0.9304842  1.67553098  0.1747569
## 4   0.9304842  0.46221544  0.3613618
## 5   0.9304842 -0.02311077 -0.1984528
## 6   0.9304842 -1.35775786 -0.9448723
## 7   0.9304842  1.39242402  2.0408057
## 8   0.9304842  1.31153632  2.1652089
## 9   0.9304842  0.46221544  0.7345716
## 10  0.9304842  1.59464328  1.6053943
## 11  0.9304842 -1.07465091 -0.6338642
## 12 -1.0235326 -0.95331935 -0.7582674
## 13 -1.0235326 -0.38710543 -0.6338642
## 14 -1.0235326 -0.46799314 -0.7582674
## 15 -1.0235326  0.21955234  0.1747569
## 16 -1.0235326 -1.43864557 -1.2558804
## 17 -1.0235326 -1.11509476 -1.0692756
## 18 -1.0235326  0.05777693  0.1125553
## 19 -1.0235326 -0.38710543 -0.3228561
## 20 -1.0235326  0.46221544  0.7345716
## 21 -1.0235326 -1.31731401 -1.0692756

str(germ_df1_scale)

## 'data.frame':    21 obs. of  3 variables:
##  $ seed: num  0.93 0.93 0.93 0.93 0.93 ...
##  $ n   : num  -0.0231 0.9071 1.6755 0.4622 -0.0231 ...
##  $ y   : num  -0.634 0.175 0.175 0.361 -0.198 ...

View(germ_df1_scale)
dist_mat <- dist(germ_df1_scale, method="euclidean")
#View(dist_mat)

Comparison of cluster methods and Silhouttes coefficients

library(fpc)

## Warning: package 'fpc' was built under R version 4.3.3

library (cluster)
library (vegan)

## Warning: package 'vegan' was built under R version 4.3.3

## Loading required package: permute

## Warning: package 'permute' was built under R version 4.3.3

## Loading required package: lattice

## This is vegan 2.6-4

h_germ_df1<- hclust(dist_mat, method='average')
h_germ_df1_single<- hclust(dist_mat, method='single')
h_germ_df1_complete<- hclust(dist_mat, method='complete')

#show summary of cluster for method='average'
h_germ_df1

## 
## Call:
## hclust(d = dist_mat, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 21

##fixing margin too large error
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)

#show summary of cluster for method='single'
h_germ_df1_single

## 
## Call:
## hclust(d = dist_mat, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 21

par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_single)

#show summary of cluster for method='complete'
h_germ_df1_complete

## 
## Call:
## hclust(d = dist_mat, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 21

par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_complete)

### Comparison of silhouette coefficients
################

set.seed(1234)
#d <- dist(scale(iris[ , -5]))
d <- dist(germ_df1_scale, method="euclidean")
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
               dimnames=list(2:6, methds))
for(k in 2:6)
  for(m in seq_along(methds)) {
    h <- hclust(d, meth=methds[m])
    c <- cutree(h, k)
    s <- silhouette(c, d)
    #plot(silhouette(c, d))
    avgS[k-1, m] <- mean(s[ , 3])
  }
avgS

##    complete    single   average
## 2 0.3692837 0.4475646 0.4116283
## 3 0.3475103 0.4804590 0.4804590
## 4 0.4990830 0.5035927 0.5035927
## 5 0.5468301 0.4559720 0.5524545
## 6 0.5677947 0.5005272 0.5080595

plot(silhouette(c, d))

####################3

h_germ_df1<- hclust(dist_mat, method='average')
h_germ_df1_single<- hclust(dist_mat, method='single')
h_germ_df1_complete<- hclust(dist_mat, method='complete')

#show summary of cluster for method='average'
h_germ_df1

## 
## Call:
## hclust(d = dist_mat, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 21

##fixing margin too large error
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)

#show summary of cluster for method='single'
h_germ_df1_single

## 
## Call:
## hclust(d = dist_mat, method = "single")
## 
## Cluster method   : single 
## Distance         : euclidean 
## Number of objects: 21

par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_single)

#show summary of cluster for method='complete' and this is better
h_germ_df1_complete

## 
## Call:
## hclust(d = dist_mat, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 21

par(mar = c(1, 1, 1, 1))
plot(h_germ_df1_complete)

##Checking out cluster … Complete

hca <- h_germ_df1_complete
plot(hca)
rect.hclust(hca, k = 2, border = "red")
x <- rect.hclust(hca, h = 2, which = c(1,2), border = 2:5)

library(dendextend)

## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:permute':
## 
##     shuffle

## The following object is masked from 'package:stats':
## 
##     cutree

avg_dendogram_obj <- as.dendrogram(h_germ_df1_complete)
avg_col_dendogram <- color_branches(avg_dendogram_obj, h=2)
plot(avg_col_dendogram)

hac_cut <- cutree(h_germ_df1_complete, k=2)

Scale and compare original data to clustering classfication

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

germ_df_cl <- mutate(germ_df1_scale, cluster=hac_cut)
germ_df_cl

##          seed           n          y cluster
## 1   0.9304842 -0.02311077 -0.6338642       1
## 2   0.9304842  0.90709781  0.1747569       2
## 3   0.9304842  1.67553098  0.1747569       2
## 4   0.9304842  0.46221544  0.3613618       2
## 5   0.9304842 -0.02311077 -0.1984528       1
## 6   0.9304842 -1.35775786 -0.9448723       1
## 7   0.9304842  1.39242402  2.0408057       2
## 8   0.9304842  1.31153632  2.1652089       2
## 9   0.9304842  0.46221544  0.7345716       2
## 10  0.9304842  1.59464328  1.6053943       2
## 11  0.9304842 -1.07465091 -0.6338642       1
## 12 -1.0235326 -0.95331935 -0.7582674       1
## 13 -1.0235326 -0.38710543 -0.6338642       1
## 14 -1.0235326 -0.46799314 -0.7582674       1
## 15 -1.0235326  0.21955234  0.1747569       2
## 16 -1.0235326 -1.43864557 -1.2558804       1
## 17 -1.0235326 -1.11509476 -1.0692756       1
## 18 -1.0235326  0.05777693  0.1125553       2
## 19 -1.0235326 -0.38710543 -0.3228561       1
## 20 -1.0235326  0.46221544  0.7345716       2
## 21 -1.0235326 -1.31731401 -1.0692756       1

## Count Cluster classification
count(germ_df_cl, cluster)

##   cluster  n
## 1       1 11
## 2       2 10

## compare with Original Dataset classification
count(germ_df, seed)

## # A tibble: 2 × 2
##   seed      n
##   <chr> <int>
## 1 O73      10
## 2 O75      11

Draw the plot

library(ggplot2)
##ggplot(seeds_df_cl, aes(x=seeds_df_cl$area, y=seeds_df_cl$perimeter , color=factor(cluster))) + geom_point()

ggplot(germ_df_cl, aes(x=n , y=y  , color=factor(cluster))) + geom_point()

#Use of `seeds_df_cl$perimeter` is discouraged.ℹ Use `perimeter` instead. 
##ggplot(seeds_df_cl, aes(x=area, y=perimeter , color=factor(cluster))) + geom_point()

ggplot(germ_df, aes(x=n, y=y , color=factor(germ_df1$seed))) + geom_point()

#Install clvalid
#library(clValid)
#dunn(dist_mat, hac_cut)

compute Dunn’s index

#Install clvalid
library(clValid)
dunn(dist_mat, hac_cut)

## [1] 0.2008738

dist_mat[1:21]

##  [1] 1.2325405 1.8812900 1.1072562 0.4354114 1.3704047 3.0261523 3.1009826
##  [8] 1.4519497 2.7625001 1.0515401 2.1677052 1.9876302 2.0078791 2.1285994
## [15] 2.4917513 2.2803934 2.0932909 2.0118151 2.4344075 2.3838471 0.7684332

###KNN

set.seed(1234)
germ_knn <- kmeans(germ_df1, center=2, iter.max=10, nstart=10)
germ_knn

## K-means clustering with 2 clusters of sizes 12, 9
## 
## Cluster means:
##       seed        n        y
## 1 1.333333 22.08333  9.25000
## 2 1.777778 62.88889 34.77778
## 
## Clustering vector:
##  [1] 1 2 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 2569.833 2916.000
##  (between_SS / total_SS =  68.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

germ_knn$betweenss

## [1] 11915.79

germ_knn$withinss

## [1] 2569.833 2916.000

germ_knn$tot.withinss

## [1] 5485.833

table(germ_knn$cluster, germ_df1$seed)

##    
##     1 2
##   1 8 4
##   2 2 7

cm <- table(germ_knn$cluster, germ_df1$seed)
1 - sum(diag(cm))/sum(cm)

## [1] 0.2857143