This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readr)
# Load the germination data set; the parsed file has 21 rows and four columns
# (rownames, seed, n, y).
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# document presumably cannot be knitted on another machine — consider a
# project-relative path.
Germination <- read_csv("C:/GGTUAN/DREAMS/Yankee/TSU/MSc_TSU/Spring_2024/CS-583 Data Minning/Project_Data/Germination.csv")
## Rows: 21 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): seed
## dbl (3): rownames, n, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Germination)
germ_df <- Germination
germ_df
## # A tibble: 21 × 4
## rownames seed n y
## <dbl> <chr> <dbl> <dbl>
## 1 1 O75 39 10
## 2 2 O75 62 23
## 3 3 O75 81 23
## 4 4 O75 51 26
## 5 5 O75 39 17
## 6 6 O75 6 5
## 7 7 O75 74 53
## 8 8 O75 72 55
## 9 9 O75 51 32
## 10 10 O75 79 46
## # ℹ 11 more rows
View(germ_df)
str(germ_df)
## spc_tbl_ [21 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ rownames: num [1:21] 1 2 3 4 5 6 7 8 9 10 ...
## $ seed : chr [1:21] "O75" "O75" "O75" "O75" ...
## $ n : num [1:21] 39 62 81 51 39 6 74 72 51 79 ...
## $ y : num [1:21] 10 23 23 26 17 5 53 55 32 46 ...
## - attr(*, "spec")=
## .. cols(
## .. rownames = col_double(),
## .. seed = col_character(),
## .. n = col_double(),
## .. y = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(germ_df)
## [1] 21 4
germ_df$seed
## [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"
table(germ_df$seed)
##
## O73 O75
## 10 11
prop.table(table(germ_df$seed))
##
## O73 O75
## 0.4761905 0.5238095
# Keep only the numeric columns for clustering by dropping the `seed` label.
# The column is selected by name rather than by position so the result stays
# correct even if the column order of the input changes.
germ_df1 <- germ_df[, names(germ_df) != "seed"]
germ_df1
## # A tibble: 21 × 3
## rownames n y
## <dbl> <dbl> <dbl>
## 1 1 39 10
## 2 2 62 23
## 3 3 81 23
## 4 4 51 26
## 5 5 39 17
## 6 6 6 5
## 7 7 74 53
## 8 8 72 55
## 9 9 51 32
## 10 10 79 46
## # ℹ 11 more rows
dim(germ_df1)
## [1] 21 3
View(germ_df1)
germ_df1_scale <- as.data.frame(scale(germ_df1))
str(germ_df1_scale)
## 'data.frame': 21 obs. of 3 variables:
## $ rownames: num -1.612 -1.45 -1.289 -1.128 -0.967 ...
## $ n : num -0.0231 0.9071 1.6755 0.4622 -0.0231 ...
## $ y : num -0.634 0.175 0.175 0.361 -0.198 ...
View(germ_df1_scale)
# Pairwise Euclidean distances between the standardised observations.
dist_mat <- dist(germ_df1_scale, method="euclidean")
View(dist_mat)
# Agglomerative hierarchical clustering with average linkage.
h_germ_df1<- hclust(dist_mat, method='average')
# Print a summary of the fit (call, linkage method, distance, object count).
h_germ_df1
##
## Call:
## hclust(d = dist_mat, method = "average")
##
## Cluster method : average
## Distance : euclidean
## Number of objects: 21
# Shrink the plot margins to work around the "figure margins too large" error
# before drawing the dendrogram.
# NOTE(review): par() is not restored afterwards, so later base-graphics plots
# may inherit these margins.
#par(mar)
par(mar = c(1, 1, 1, 1))
plot(h_germ_df1)
##Checking out cluster … In this case, it is just 2
hca <- h_germ_df1
plot(hca)
rect.hclust(hca, k = 2, border = "red")
x <- rect.hclust(hca, h = 2, which = c(1,3), border = 2:5)
x
## [[1]]
## [1] 6 11 12 13 14 15 16 17 18 19 20 21
##
## [[2]]
## [1] 1 2 3 4 5 9
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Convert the hclust result to a dendrogram and colour its branches by the
# clusters obtained when the tree is cut at height 2.
avg_dendogram_obj <- as.dendrogram(h_germ_df1)
avg_col_dendogram <- color_branches(avg_dendogram_obj, h=2)
plot(avg_col_dendogram)
# Assign every observation to one of two clusters.
# NOTE(review): the colouring cuts by height (h=2) while this assignment cuts
# by count (k=2); the two need not yield the same number of clusters.
# NOTE(review): dendextend is attached at this point, so `cutree` resolves to
# its masking version rather than stats::cutree.
hac_cut <- cutree(h_germ_df1, k=2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
germ_df_cl <- mutate(germ_df1_scale, cluster=hac_cut)
germ_df_cl
## rownames n y cluster
## 1 -1.6116459 -0.02311077 -0.6338642 1
## 2 -1.4504813 0.90709781 0.1747569 1
## 3 -1.2893167 1.67553098 0.1747569 1
## 4 -1.1281521 0.46221544 0.3613618 1
## 5 -0.9669876 -0.02311077 -0.1984528 1
## 6 -0.8058230 -1.35775786 -0.9448723 2
## 7 -0.6446584 1.39242402 2.0408057 1
## 8 -0.4834938 1.31153632 2.1652089 1
## 9 -0.3223292 0.46221544 0.7345716 1
## 10 -0.1611646 1.59464328 1.6053943 1
## 11 0.0000000 -1.07465091 -0.6338642 2
## 12 0.1611646 -0.95331935 -0.7582674 2
## 13 0.3223292 -0.38710543 -0.6338642 2
## 14 0.4834938 -0.46799314 -0.7582674 2
## 15 0.6446584 0.21955234 0.1747569 2
## 16 0.8058230 -1.43864557 -1.2558804 2
## 17 0.9669876 -1.11509476 -1.0692756 2
## 18 1.1281521 0.05777693 0.1125553 2
## 19 1.2893167 -0.38710543 -0.3228561 2
## 20 1.4504813 0.46221544 0.7345716 2
## 21 1.6116459 -1.31731401 -1.0692756 2
## Count Cluster classification
count(germ_df_cl, cluster)
## cluster n
## 1 1 9
## 2 2 12
## compare with Original Dataset classification
count(germ_df, seed)
## # A tibble: 2 × 2
## seed n
## <chr> <int>
## 1 O73 10
## 2 O75 11
library(ggplot2)
##ggplot(seeds_df_cl, aes(x=seeds_df_cl$area, y=seeds_df_cl$perimeter , color=factor(cluster))) + geom_point()
ggplot(germ_df_cl, aes(x=n , y=y , color=factor(cluster))) + geom_point()
#Use of `seeds_df_cl$perimeter` is discouraged.ℹ Use `perimeter` instead.
##ggplot(seeds_df_cl, aes(x=area, y=perimeter , color=factor(cluster))) + geom_point()
# Same scatter plot, coloured by the true seed type instead of the cluster id.
# `seed` is referenced as a bare column name: using `germ_df$seed` inside aes()
# is discouraged by ggplot2 because it bypasses the `data` argument.
ggplot(germ_df, aes(x = n, y = y, color = factor(seed))) + geom_point()
#Install the clValid package (install.packages("clValid")) before loading it
library(clValid)
## Loading required package: cluster
dunn(dist_mat, hac_cut)
## [1] 0.3412918
dist_mat[1:21]
## [1] 1.2430326 1.9087032 1.2082146 0.7779251 1.5897671 3.1768951 3.2998213
## [8] 1.9417764 3.1201447 1.9243542 2.0058966 1.9679309 2.1454619 2.4090792
## [15] 2.8696339 2.8339659 2.8408059 2.9402045 3.3889204 3.5005936 0.7851519
### K-means clustering (unsupervised; not k-nearest neighbours)
set.seed(1234)
# Partition the numeric columns into two clusters. `centers` is spelled out in
# full instead of relying on partial argument matching.
# NOTE(review): `iter.max = 1` permits a single iteration, and the unscaled
# `rownames` index column is part of the features — confirm both are intended.
germ_knn <- kmeans(
  germ_df[, names(germ_df) != "seed"],
  centers = 2,
  iter.max = 1
)
germ_knn
## K-means clustering with 2 clusters of sizes 12, 9
##
## Cluster means:
## rownames n y
## 1 12.750000 22.08333 9.25000
## 2 8.666667 62.88889 34.77778
##
## Clustering vector:
## [1] 1 2 2 2 1 1 2 2 2 2 1 1 1 1 2 1 1 1 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 2979.417 3186.444
## (between_SS / total_SS = 66.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
germ_knn$betweenss
## [1] 12000.52
table(germ_knn$cluster, germ_df$seed)
##
## O73 O75
## 1 8 4
## 2 2 7
# Cross-tabulate the k-means cluster ids against the true seed type.
cm <- table(germ_knn$cluster, germ_df$seed)
# Misclassification rate. Cluster ids are arbitrary, so each cluster is matched
# to its majority seed type rather than assuming that clusters 1 and 2 happen
# to line up with the diagonal of the table.
1 - sum(apply(cm, 1, max)) / sum(cm)
## [1] 0.2857143
####PREDICTION
prop.table(table(germ_df$seed))
##
## O73 O75
## 0.4761905 0.5238095
#summary( wd1$radius_mean, wd1$area_mean, wd1$smoothness_mean)
summary(germ_df1[c("rownames", "n", "y")])
## rownames n y
## Min. : 1 Min. : 4.00 Min. : 0.00
## 1st Qu.: 6 1st Qu.:16.00 1st Qu.: 8.00
## Median :11 Median :39.00 Median :17.00
## Mean :11 Mean :39.57 Mean :20.19
## 3rd Qu.:16 3rd Qu.:51.00 3rd Qu.:26.00
## Max. :21 Max. :81.00 Max. :55.00
# Min-max rescale a numeric vector to the interval [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (default), missing values are ignored when computing the
#          range and stay NA in the result; if FALSE, any NA in `x` makes the
#          whole result NA.
#
# Returns a numeric vector of the same length as `x`. Empty and all-NA inputs
# are returned unchanged (as numeric), and a constant vector maps to all zeros
# instead of NaN from a division by zero.
normalize <- function(x, na.rm = TRUE) {
  # Nothing to rescale; also avoids the warnings range() emits on empty input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # The range is undefined (NA present with na.rm = FALSE, or infinite values).
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }
  # Constant input: every value sits at the minimum, so map it to 0.
  if (span == 0) {
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}
germ_df1
## # A tibble: 21 × 3
## rownames n y
## <dbl> <dbl> <dbl>
## 1 1 39 10
## 2 2 62 23
## 3 3 81 23
## 4 4 51 26
## 5 5 39 17
## 6 6 6 5
## 7 7 74 53
## 8 8 72 55
## 9 9 51 32
## 10 10 79 46
## # ℹ 11 more rows
germ_df1_n <- as.data.frame(lapply(germ_df1,normalize))
germ_df1_n
## rownames n y
## 1 0.00 0.45454545 0.18181818
## 2 0.05 0.75324675 0.41818182
## 3 0.10 1.00000000 0.41818182
## 4 0.15 0.61038961 0.47272727
## 5 0.20 0.45454545 0.30909091
## 6 0.25 0.02597403 0.09090909
## 7 0.30 0.90909091 0.96363636
## 8 0.35 0.88311688 1.00000000
## 9 0.40 0.61038961 0.58181818
## 10 0.45 0.97402597 0.83636364
## 11 0.50 0.11688312 0.18181818
## 12 0.55 0.15584416 0.14545455
## 13 0.60 0.33766234 0.18181818
## 14 0.65 0.31168831 0.14545455
## 15 0.70 0.53246753 0.41818182
## 16 0.75 0.00000000 0.00000000
## 17 0.80 0.10389610 0.05454545
## 18 0.85 0.48051948 0.40000000
## 19 0.90 0.33766234 0.27272727
## 20 0.95 0.61038961 0.58181818
## 21 1.00 0.03896104 0.05454545
summary(germ_df1_n[c("rownames", "n", "y")])
## rownames n y
## Min. :0.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.25 1st Qu.:0.1558 1st Qu.:0.1455
## Median :0.50 Median :0.4545 Median :0.3091
## Mean :0.50 Mean :0.4620 Mean :0.3671
## 3rd Qu.:0.75 3rd Qu.:0.6104 3rd Qu.:0.4727
## Max. :1.00 Max. :1.0000 Max. :1.0000
# Encode the seed type as a factor. The levels must match the values stored in
# the data exactly — "O73"/"O75" with a capital letter O, not the numbers
# 73/75 — otherwise every observation is mapped to NA.
germ_df$seed <- factor(germ_df$seed, levels = c("O73", "O75"))
str(germ_df$seed)
## Factor w/ 2 levels "O73","O75": 2 2 2 2 2 2 2 2 2 2 ...
dim(germ_df$seed)
## NULL
# Hold out rows 10-13 of the normalised data as the test set; the remaining
# rows (1-9 and 14-21) form the training set.
set.seed(1234)
germ_df_test <- germ_df1_n[seq(10, 13), ]
germ_df_train1 <- germ_df1_n[seq_len(9), ]
germ_df_train2 <- germ_df1_n[seq(14, 21), ]
germ_df_train <- rbind(germ_df_train1, germ_df_train2)
#table1 <- data.frame(ID = 1:3, Name = c("John", "Alice", "Bob"))
#table2 <- data.frame(ID = 4:6, Name = c("Charlie", "David", "Emily"))
#combined_table <- rbind(table1, table2)
#print(combined_table)
# Reset germ_df to the data as loaded, so `seed` is the character column read
# from the CSV again (this discards the earlier factor conversion).
germ_df <- Germination
# Seed labels for the same row ranges used for the training features
# (rows 1-9 and 14-21). Column 2 is `seed`; single-bracket indexing on a
# tibble returns one-column tibbles, not plain vectors.
germ_df_train_labels1 <- germ_df[1:9,2]
germ_df_train_labels2 <- germ_df[14:21,2]
germ_df_train_labels <- rbind(germ_df_train_labels1, germ_df_train_labels2)
# Seed labels for the held-out test rows 10-13.
germ_df_test_labels <- germ_df[10:13,2]
germ_df_test_labels
## # A tibble: 4 × 1
## seed
## <chr>
## 1 O75
## 2 O75
## 3 O73
## 4 O73
unique(germ_df_train_labels)
## # A tibble: 2 × 1
## seed
## <chr>
## 1 O75
## 2 O73
germ_df_train_labels$seed
## [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73" "O73" "O73"
## [13] "O73" "O73" "O73" "O73" "O73"
dim(germ_df_train_labels)
## [1] 17 1
head(germ_df_train_labels)
## # A tibble: 6 × 1
## seed
## <chr>
## 1 O75
## 2 O75
## 3 O75
## 4 O75
## 5 O75
## 6 O75
head(germ_df_test_labels)
## # A tibble: 4 × 1
## seed
## <chr>
## 1 O75
## 2 O75
## 3 O73
## 4 O73
germ_df_test_labels
## # A tibble: 4 × 1
## seed
## <chr>
## 1 O75
## 2 O75
## 3 O73
## 4 O73
germ_df[,2]
## # A tibble: 21 × 1
## seed
## <chr>
## 1 O75
## 2 O75
## 3 O75
## 4 O75
## 5 O75
## 6 O75
## 7 O75
## 8 O75
## 9 O75
## 10 O75
## # ℹ 11 more rows
print(germ_df$seed[1:21])
## [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73"
## [13] "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73" "O73"
library(class)
# Classify the four held-out rows with k-nearest neighbours (k = 2), using the
# training seed labels as the classes.
# NOTE(review): with an even k and two classes the vote can tie; class::knn
# breaks ties at random, so predictions depend on the RNG state — an odd k may
# be preferable.
# NOTE(review): the normalised `rownames` index is one of the features and the
# rows are ordered by seed type, so it presumably leaks the label — confirm it
# is meant to be a predictor.
germ_df_test_pred <- knn(train=germ_df_train, test=germ_df_test,cl=germ_df_train_labels$seed, k=2)
germ_df_test_pred
## [1] O75 O73 O73 O73
## Levels: O73 O75
dim(germ_df_test_pred)
## NULL
germ_df_train
## rownames n y
## 1 0.00 0.45454545 0.18181818
## 2 0.05 0.75324675 0.41818182
## 3 0.10 1.00000000 0.41818182
## 4 0.15 0.61038961 0.47272727
## 5 0.20 0.45454545 0.30909091
## 6 0.25 0.02597403 0.09090909
## 7 0.30 0.90909091 0.96363636
## 8 0.35 0.88311688 1.00000000
## 9 0.40 0.61038961 0.58181818
## 14 0.65 0.31168831 0.14545455
## 15 0.70 0.53246753 0.41818182
## 16 0.75 0.00000000 0.00000000
## 17 0.80 0.10389610 0.05454545
## 18 0.85 0.48051948 0.40000000
## 19 0.90 0.33766234 0.27272727
## 20 0.95 0.61038961 0.58181818
## 21 1.00 0.03896104 0.05454545
germ_df_test
## rownames n y
## 10 0.45 0.9740260 0.8363636
## 11 0.50 0.1168831 0.1818182
## 12 0.55 0.1558442 0.1454545
## 13 0.60 0.3376623 0.1818182
germ_df_train_labels$seed
## [1] "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O75" "O73" "O73" "O73"
## [13] "O73" "O73" "O73" "O73" "O73"
#Install the gmodels package, then load it with library(gmodels)
germ_df_test_pred
## [1] O75 O73 O73 O73
## Levels: O73 O75
germ_df_test_labels$seed
## [1] "O75" "O75" "O73" "O73"
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
CrossTable(x=germ_df_test_labels$seed , y=germ_df_test_pred , prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 4
##
##
## | germ_df_test_pred
## germ_df_test_labels$seed | O73 | O75 | Row Total |
## -------------------------|-----------|-----------|-----------|
## O73 | 2 | 0 | 2 |
## | 1.000 | 0.000 | 0.500 |
## | 0.667 | 0.000 | |
## | 0.500 | 0.000 | |
## -------------------------|-----------|-----------|-----------|
## O75 | 1 | 1 | 2 |
## | 0.500 | 0.500 | 0.500 |
## | 0.333 | 1.000 | |
## | 0.250 | 0.250 | |
## -------------------------|-----------|-----------|-----------|
## Column Total | 3 | 1 | 4 |
## | 0.750 | 0.250 | |
## -------------------------|-----------|-----------|-----------|
##
##