R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Importing the dataset

library(readr)
# Load the germination data set with readr.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# document can only be re-run where the file exists at exactly this location.
Germination <- read_csv("C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data/Germination.csv")
## Rows: 21 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): seed
## dbl (3): rownames, n, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Germination)

germ_df <- Germination

Inspecting the values in the dataset

germ_df
## # A tibble: 21 × 4
##    rownames seed      n     y
##       <dbl> <chr> <dbl> <dbl>
##  1        1 O75      39    10
##  2        2 O75      62    23
##  3        3 O75      81    23
##  4        4 O75      51    26
##  5        5 O75      39    17
##  6        6 O75       6     5
##  7        7 O75      74    53
##  8        8 O75      72    55
##  9        9 O75      51    32
## 10       10 O75      79    46
## # ℹ 11 more rows
View(germ_df)
str(germ_df)
## spc_tbl_ [21 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ rownames: num [1:21] 1 2 3 4 5 6 7 8 9 10 ...
##  $ seed    : chr [1:21] "O75" "O75" "O75" "O75" ...
##  $ n       : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
##  $ y       : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rownames = col_double(),
##   ..   seed = col_character(),
##   ..   n = col_double(),
##   ..   y = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
dim(germ_df)
## [1] 21  4
germ_df$seed
##  [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"
table(germ_df$seed)
## 
## O73 O75 
##  10  11
prop.table(table(germ_df$seed))
## 
##       O73       O75 
## 0.4761905 0.5238095
germ_df1 <- germ_df[,-2]
germ_df1
## # A tibble: 21 × 3
##    rownames     n     y
##       <dbl> <dbl> <dbl>
##  1        1    39    10
##  2        2    62    23
##  3        3    81    23
##  4        4    51    26
##  5        5    39    17
##  6        6     6     5
##  7        7    74    53
##  8        8    72    55
##  9        9    51    32
## 10       10    79    46
## # ℹ 11 more rows
dim(germ_df1)
## [1] 21  3
View(germ_df1)
# Standardise every column of germ_df1 with scale() and convert the resulting
# matrix back to a data frame.
# NOTE(review): germ_df1 still contains the `rownames` index column, so the row
# number is standardised too and takes part in the distance computation below
# as if it were a feature — confirm this is intended.
germ_df1_scale <- as.data.frame(scale(germ_df1))
str(germ_df1_scale)
## 'data.frame':    21 obs. of  3 variables:
##  $ rownames: num  -1.612 -1.45 -1.289 -1.128 -0.967 ...
##  $ n       : num  -0.0231 0.9071 1.6755 0.4622 -0.0231 ...
##  $ y       : num  -0.634 0.175 0.175 0.361 -0.198 ...
View(germ_df1_scale)
# Pairwise Euclidean distances between the rows of the scaled data.
dist_mat <- dist(germ_df1_scale, method="euclidean")
View(dist_mat)
# Agglomerative hierarchical clustering of those distances, average linkage.
h_germ_df1<- hclust(dist_mat, method='average')
# Print the clustering summary (call, method, distance, number of objects).
h_germ_df1
## 
## Call:
## hclust(d = dist_mat, method = "average")
## 
## Cluster method   : average 
## Distance         : euclidean 
## Number of objects: 21
# Shrink the figure margins to work around the "margin too large" plotting
# error; par() changes the setting for the plots drawn after it.
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)

# Inspect the clusters on the dendrogram; here the tree is split into 2 groups.

hca <- h_germ_df1
plot(hca)
# Outline the clusters obtained when the tree is cut into k = 2 groups.
rect.hclust(hca, k = 2, border = "red")
# Cut the tree at height 2 instead and outline only the 1st and 3rd clusters
# (numbered from left to right); the members of the outlined clusters are
# stored in `x`.
x <- rect.hclust(hca, h = 2, which = c(1,3), border = 2:5)

x
## [[1]]
##  [1]  6 11 12 13 14 15 16 17 18 19 20 21
## 
## [[2]]
## [1] 1 2 3 4 5 9
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Convert the hclust result to a dendrogram and colour its branches by the
# clusters obtained when cutting at height 2.
avg_dendogram_obj <- as.dendrogram(h_germ_df1)
avg_col_dendogram <- color_branches(avg_dendogram_obj, h=2)
plot(avg_col_dendogram)

# Assign every observation to one of k = 2 clusters. With dendextend attached,
# this call resolves to dendextend's cutree, which masks stats::cutree.
# NOTE(review): the coloured dendrogram above cuts at h = 2 while the labels
# here use k = 2; the earlier rect.hclust(h = 2, which = c(1,3)) call implies
# at least three clusters at that height, so plot and labels may not agree.
hac_cut <- cutree(h_germ_df1, k=2)

Scale and compare the original data with the clustering classification

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
germ_df_cl <- mutate(germ_df1_scale, cluster=hac_cut)
germ_df_cl
##      rownames           n          y cluster
## 1  -1.6116459 -0.02311077 -0.6338642       1
## 2  -1.4504813  0.90709781  0.1747569       1
## 3  -1.2893167  1.67553098  0.1747569       1
## 4  -1.1281521  0.46221544  0.3613618       1
## 5  -0.9669876 -0.02311077 -0.1984528       1
## 6  -0.8058230 -1.35775786 -0.9448723       2
## 7  -0.6446584  1.39242402  2.0408057       1
## 8  -0.4834938  1.31153632  2.1652089       1
## 9  -0.3223292  0.46221544  0.7345716       1
## 10 -0.1611646  1.59464328  1.6053943       1
## 11  0.0000000 -1.07465091 -0.6338642       2
## 12  0.1611646 -0.95331935 -0.7582674       2
## 13  0.3223292 -0.38710543 -0.6338642       2
## 14  0.4834938 -0.46799314 -0.7582674       2
## 15  0.6446584  0.21955234  0.1747569       2
## 16  0.8058230 -1.43864557 -1.2558804       2
## 17  0.9669876 -1.11509476 -1.0692756       2
## 18  1.1281521  0.05777693  0.1125553       2
## 19  1.2893167 -0.38710543 -0.3228561       2
## 20  1.4504813  0.46221544  0.7345716       2
## 21  1.6116459 -1.31731401 -1.0692756       2
## Count Cluster classification
count(germ_df_cl, cluster)
##   cluster  n
## 1       1  9
## 2       2 12
## compare with Original Dataset classification
count(germ_df, seed)
## # A tibble: 2 × 2
##   seed      n
##   <chr> <int>
## 1 O73      10
## 2 O75      11

Draw the plot

library(ggplot2)
# Scaled n vs. y, coloured by the hierarchical cluster assigned to each row.
ggplot(germ_df_cl, aes(x = n, y = y, color = factor(cluster))) +
  geom_point()

# Raw n vs. y, coloured by the true seed type, for comparison with the plot
# above. Columns are referenced by bare name inside aes(): ggplot2 warns that
# `data$column` is discouraged there because it bypasses the `data` argument.
ggplot(germ_df, aes(x = n, y = y, color = factor(seed))) +
  geom_point()

Compute Dunn’s index

# Requires the clValid package (install once with install.packages("clValid"))
library(clValid)
## Loading required package: cluster
dunn(dist_mat, hac_cut)       
## [1] 0.3412918
dist_mat[1:21]
##  [1] 1.2430326 1.9087032 1.2082146 0.7779251 1.5897671 3.1768951 3.2998213
##  [8] 1.9417764 3.1201447 1.9243542 2.0058966 1.9679309 2.1454619 2.4090792
## [15] 2.8696339 2.8339659 2.8408059 2.9402045 3.3889204 3.5005936 0.7851519

### K-means clustering

# Fix the RNG seed so the random choice of starting centres is repeatable.
set.seed(1234)
# k-means with 2 clusters on every column except the 2nd one (`seed`, the
# label). `centers` is spelled out in full rather than relying on partial
# argument matching.
# NOTE(review): the `rownames` index column is still among the clustered
# columns, the data are not scaled here, and `iter.max = 1` allows only a
# single iteration — confirm all three are intended.
germ_knn <- kmeans(germ_df[, -2], centers = 2, iter.max = 1)
germ_knn
## K-means clustering with 2 clusters of sizes 12, 9
## 
## Cluster means:
##    rownames        n        y
## 1 12.750000 22.08333  9.25000
## 2  8.666667 62.88889 34.77778
## 
## Clustering vector:
##  [1] 1 2 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 2979.417 3186.444
##  (between_SS / total_SS =  66.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
germ_knn$betweenss
## [1] 12000.52
table(germ_knn$cluster, germ_df$seed)
##    
##     O73 O75
##   1   8   4
##   2   2   7
# Cross-tabulate the k-means cluster ids (rows) against the true seed types
# (columns).
cm <- table(germ_knn$cluster, germ_df$seed)
# Share of observations that fall off the diagonal, reported as the error rate.
# NOTE(review): the diagonal pairs cluster 1 with O73 and cluster 2 with O75.
# Cluster ids are arbitrary, so this pairing has to be re-checked whenever the
# clustering is re-run.
1 - sum(diag(cm))/sum(cm)
## [1] 0.2857143
#### PREDICTION (k-nearest neighbours)

prop.table(table(germ_df$seed))
## 
##       O73       O75 
## 0.4761905 0.5238095
#summary( wd1$radius_mean, wd1$area_mean, wd1$smoothness_mean)

summary(germ_df1[c("rownames", "n", "y")])
##     rownames        n               y        
##  Min.   : 1   Min.   : 4.00   Min.   : 0.00  
##  1st Qu.: 6   1st Qu.:16.00   1st Qu.: 8.00  
##  Median :11   Median :39.00   Median :17.00  
##  Mean   :11   Mean   :39.57   Mean   :20.19  
##  3rd Qu.:16   3rd Qu.:51.00   3rd Qu.:26.00  
##  Max.   :21   Max.   :81.00   Max.   :55.00
# Min-max normalisation: rescale a numeric vector linearly so that its smallest
# value maps to 0 and its largest to 1.
#
# x      Numeric vector to rescale.
# na.rm  If TRUE, missing values are ignored when finding the minimum and
#        maximum (they stay NA in the result). With the default FALSE, any NA
#        in `x` makes the whole result NA.
#
# Returns a numeric vector of the same length as `x`. A vector whose values
# are all equal has a zero range and yields NaN.
normalize <- function(x, na.rm = FALSE) {
  # range() returns c(min, max) in one call.
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}

germ_df1
## # A tibble: 21 × 3
##    rownames     n     y
##       <dbl> <dbl> <dbl>
##  1        1    39    10
##  2        2    62    23
##  3        3    81    23
##  4        4    51    26
##  5        5    39    17
##  6        6     6     5
##  7        7    74    53
##  8        8    72    55
##  9        9    51    32
## 10       10    79    46
## # ℹ 11 more rows
germ_df1_n <- as.data.frame(lapply(germ_df1,normalize))
germ_df1_n
##    rownames          n          y
## 1      0.00 0.45454545 0.18181818
## 2      0.05 0.75324675 0.41818182
## 3      0.10 1.00000000 0.41818182
## 4      0.15 0.61038961 0.47272727
## 5      0.20 0.45454545 0.30909091
## 6      0.25 0.02597403 0.09090909
## 7      0.30 0.90909091 0.96363636
## 8      0.35 0.88311688 1.00000000
## 9      0.40 0.61038961 0.58181818
## 10     0.45 0.97402597 0.83636364
## 11     0.50 0.11688312 0.18181818
## 12     0.55 0.15584416 0.14545455
## 13     0.60 0.33766234 0.18181818
## 14     0.65 0.31168831 0.14545455
## 15     0.70 0.53246753 0.41818182
## 16     0.75 0.00000000 0.00000000
## 17     0.80 0.10389610 0.05454545
## 18     0.85 0.48051948 0.40000000
## 19     0.90 0.33766234 0.27272727
## 20     0.95 0.61038961 0.58181818
## 21     1.00 0.03896104 0.05454545
summary(germ_df1_n[c("rownames", "n", "y")])
##     rownames          n                y         
##  Min.   :0.00   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.25   1st Qu.:0.1558   1st Qu.:0.1455  
##  Median :0.50   Median :0.4545   Median :0.3091  
##  Mean   :0.50   Mean   :0.4620   Mean   :0.3671  
##  3rd Qu.:0.75   3rd Qu.:0.6104   3rd Qu.:0.4727  
##  Max.   :1.00   Max.   :1.0000   Max.   :1.0000
# Convert the seed labels to a factor. The levels must match the stored strings
# exactly: they begin with the capital letter "O" ("O73", "O75"), not the digit
# zero, so numeric levels such as 073 / 075 match nothing and give only NA.
germ_df$seed <- factor(germ_df$seed, levels = c("O73", "O75"))

str(germ_df$seed)
##  Factor w/ 2 levels "O73","O75": 2 2 2 2 2 2 2 2 2 2 ...
dim(germ_df$seed)
## NULL
# The split below is by fixed row index, not by random sampling, so this seed
# does not influence which rows go where; it only fixes the RNG state for any
# random step that follows.
set.seed(1234)
# Training set: rows 1-9 and 14-21 of the min-max normalised data (17 rows).
germ_df_train1 <- germ_df1_n[1:9,]
germ_df_train2 <- germ_df1_n[14:21,]
germ_df_train  <- rbind(germ_df_train1, germ_df_train2)
# Test set: the four rows in between (10-13).
germ_df_test  <- germ_df1_n[10:13,]



#table1 <- data.frame(ID = 1:3, Name = c("John", "Alice", "Bob"))
#table2 <- data.frame(ID = 4:6, Name = c("Charlie", "David", "Emily"))
#combined_table <- rbind(table1, table2)
#print(combined_table)



# Start again from the imported data so that `seed` holds the original
# character labels.
germ_df <- Germination

# Labels (column 2, `seed`) for the same row ranges as the training features;
# each slice is a one-column tibble and rbind() stacks them in the same order.
germ_df_train_labels1 <- germ_df[1:9,2]
germ_df_train_labels2 <- germ_df[14:21,2]
germ_df_train_labels <- rbind(germ_df_train_labels1, germ_df_train_labels2)
  
# Labels for the four test rows (10-13).
germ_df_test_labels <- germ_df[10:13,2]

germ_df_test_labels
## # A tibble: 4 × 1
##   seed 
##   <chr>
## 1 O75  
## 2 O75  
## 3 O73  
## 4 O73
unique(germ_df_train_labels)
## # A tibble: 2 × 1
##   seed 
##   <chr>
## 1 O75  
## 2 O73
germ_df_train_labels$seed
##  [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73" "O73" "O73"
## [13] "O73" "O73" "O73" "O73" "O73"
dim(germ_df_train_labels)
## [1] 17  1
head(germ_df_train_labels)
## # A tibble: 6 × 1
##   seed 
##   <chr>
## 1 O75  
## 2 O75  
## 3 O75  
## 4 O75  
## 5 O75  
## 6 O75
head(germ_df_test_labels)
## # A tibble: 4 × 1
##   seed 
##   <chr>
## 1 O75  
## 2 O75  
## 3 O73  
## 4 O73
germ_df_test_labels
## # A tibble: 4 × 1
##   seed 
##   <chr>
## 1 O75  
## 2 O75  
## 3 O73  
## 4 O73
germ_df[,2]
## # A tibble: 21 × 1
##    seed 
##    <chr>
##  1 O75  
##  2 O75  
##  3 O75  
##  4 O75  
##  5 O75  
##  6 O75  
##  7 O75  
##  8 O75  
##  9 O75  
## 10 O75  
## # ℹ 11 more rows
print(germ_df$seed[1:21])
##  [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"
library(class)
# Classify each test row by the labels of its k = 2 nearest training rows.
# NOTE(review): with an even k the two neighbours can disagree; class::knn
# presumably breaks such ties at random, so the result may depend on the RNG
# state — confirm, or use an odd k.
# NOTE(review): the features include the normalised `rownames` column, and the
# rows are ordered by seed type, so the row index may leak the label — confirm.
germ_df_test_pred <- knn(train=germ_df_train, test=germ_df_test,cl=germ_df_train_labels$seed, k=2)

germ_df_test_pred
## [1] O75 O73 O73 O73
## Levels: O73 O75
dim(germ_df_test_pred)
## NULL
germ_df_train
##    rownames          n          y
## 1      0.00 0.45454545 0.18181818
## 2      0.05 0.75324675 0.41818182
## 3      0.10 1.00000000 0.41818182
## 4      0.15 0.61038961 0.47272727
## 5      0.20 0.45454545 0.30909091
## 6      0.25 0.02597403 0.09090909
## 7      0.30 0.90909091 0.96363636
## 8      0.35 0.88311688 1.00000000
## 9      0.40 0.61038961 0.58181818
## 14     0.65 0.31168831 0.14545455
## 15     0.70 0.53246753 0.41818182
## 16     0.75 0.00000000 0.00000000
## 17     0.80 0.10389610 0.05454545
## 18     0.85 0.48051948 0.40000000
## 19     0.90 0.33766234 0.27272727
## 20     0.95 0.61038961 0.58181818
## 21     1.00 0.03896104 0.05454545
germ_df_test
##    rownames         n         y
## 10     0.45 0.9740260 0.8363636
## 11     0.50 0.1168831 0.1818182
## 12     0.55 0.1558442 0.1454545
## 13     0.60 0.3376623 0.1818182
germ_df_train_labels$seed
##  [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73" "O73" "O73"
## [13] "O73" "O73" "O73" "O73" "O73"
# Install the gmodels package if needed, then load it with library(gmodels)

germ_df_test_pred
## [1] O75 O73 O73 O73
## Levels: O73 O75
germ_df_test_labels$seed
## [1] "O75" "O75" "O73" "O73"
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
CrossTable(x=germ_df_test_labels$seed , y=germ_df_test_pred ,  prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4 
## 
##  
##                          | germ_df_test_pred 
## germ_df_test_labels$seed |       O73 |       O75 | Row Total | 
## -------------------------|-----------|-----------|-----------|
##                      O73 |         2 |         0 |         2 | 
##                          |     1.000 |     0.000 |     0.500 | 
##                          |     0.667 |     0.000 |           | 
##                          |     0.500 |     0.000 |           | 
## -------------------------|-----------|-----------|-----------|
##                      O75 |         1 |         1 |         2 | 
##                          |     0.500 |     0.500 |     0.500 | 
##                          |     0.333 |     1.000 |           | 
##                          |     0.250 |     0.250 |           | 
## -------------------------|-----------|-----------|-----------|
##             Column Total |         3 |         1 |         4 | 
##                          |     0.750 |     0.250 |           | 
## -------------------------|-----------|-----------|-----------|
## 
##